Source code for src.models.face_recognition

import os
import pickle
import logging
import numpy as np
import cv2
from deepface import DeepFace
from PIL import Image
from deepface.commons import functions
from facenet_pytorch import MTCNN
from src.preprocessing.facial_preprocessing import face_alignment
from src.utils.utils import image_files_in_folder

# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)


class FaceRecognition(object):
    """Recognize known entities (faces) in images and videos.

    Faces are located with an MTCNN detector, aligned, and turned into
    embeddings by a DeepFace encoder. The embeddings are then matched
    against reference embeddings built from a directory of thumbnails,
    either by cosine distance or by an optional external recognizer model.
    """

    def __init__(self, thumbnail_list: list = None, thumbnails_path: str = 'data/thumbnails/thumbnails',
                 img_width: int = 500, encoder_name: str = 'Dlib',
                 labels_path: str = 'data/embeddings/labels.pickle',
                 embeddings_path: str = 'data/embeddings/embeddings.pickle',
                 device: str = 'cuda:0'):
        """Create the detector and encoder, then load or build the reference embeddings.

        Args:
            thumbnail_list (list): For sample use. Entity directory names to encode
                instead of every directory found in ``thumbnails_path``.
            thumbnails_path (str): Path to thumbnail directory.
            img_width (int): Video frames wider than this are scaled down to this width.
            encoder_name (str): Options are "VGG-Face", "Facenet", "OpenFace", "DeepFace",
                "DeepID", "ArcFace", "Dlib".
            labels_path (str): Path to save and load thumbnail labels.
            embeddings_path (str): Path to save and load thumbnail embeddings.
            device (str): Device handed to the MTCNN detector. Defaults to ``'cuda:0'``,
                the value that used to be hard-coded.
        """
        self.thumbnail_list = thumbnail_list
        self.thumbnails_path = thumbnails_path
        self.img_width = img_width
        self.labels_path = labels_path
        self.embeddings_path = embeddings_path

        self.detector = MTCNN(keep_all=True, post_process=False, device=device)
        self.encoder = DeepFace.build_model(encoder_name)
        # Input shape expected by the encoder (the original note gives (150, 150)).
        self.target = functions.find_input_shape(self.encoder)
        # Two parallel lists, persisted at labels_path / embeddings_path.
        self.labels, self.embeddings = self.load_embeddings()

    def recognize_video(self, video_path: str, recognizer_model=None, distance_threshold=0.6, by='second'):
        """Recognize faces in a video, sampling every frame or one frame per second.

        Args:
            video_path (str): Path to the video.
            recognizer_model (any model): Model trained with embeddings to predict entities.
            distance_threshold (float): Maximum cosine distance for a match; faces whose
                best match is at or above it are reported as 'unknown'.
            by (str): Recognize by 'second' or 'frame'. Any value other than 'frame'
                is treated as 'second'.

        Returns:
            detected_faces (set): All distinct entities recognized in the video.
            frame_faces_list (list): List of recognized entities per sampled frame.
            timestamps (list): Timestamp in milliseconds of each sampled frame.
        """
        if not os.path.exists(video_path):
            LOGGER.warning(f'{video_path} does not exists')
            # Nothing can be read from a missing file; same empty result as before,
            # without opening a capture on it.
            return set(), [], []

        LOGGER.info(f'Starting face recognition on {video_path}')

        timestamps = []
        frame_faces_list = []
        frames = []  # frames waiting to be processed as one batch
        batch_size = 128

        video = cv2.VideoCapture(video_path)
        try:
            fps = video.get(cv2.CAP_PROP_FPS)
            frame_number = 0  # seek position, used when sampling by second

            success, frame = video.read()
            while success:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Scale the frame down to the configured width, keeping aspect ratio.
                height, width = frame.shape[0], frame.shape[1]
                if width > self.img_width:
                    ratio = self.img_width / width
                    frame = cv2.resize(frame, (self.img_width, int(height * ratio)))
                frames.append(frame)

                if len(frames) == batch_size:
                    frame_faces_list.extend(
                        self.batch_recognize_images(frames, recognizer_model, distance_threshold))
                    frames.clear()

                if by == 'frame':
                    timestamps.append((timestamps[-1] + 1000 / fps) if timestamps else 0.0)
                else:  # by second: jump one second worth of frames ahead
                    timestamps.append((timestamps[-1] + 1000) if timestamps else 0.0)
                    frame_number += fps
                    video.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
                # The next frame must land in `frame`; reading it into another name
                # would make the loop re-process the previous frame indefinitely.
                success, frame = video.read()
        finally:
            video.release()

        # Flush the last, partially filled batch with the caller's threshold.
        if frames:
            frame_faces_list.extend(
                self.batch_recognize_images(frames, recognizer_model, distance_threshold))
            frames.clear()

        detected_faces = {entity for frame_faces in frame_faces_list for entity in frame_faces}
        return detected_faces, frame_faces_list, timestamps

    def batch_recognize_images(self, unknown_imgs: list, recognizer_model=None, distance_threshold=0.6):
        """Recognize entities in a batch of frames.

        Args:
            unknown_imgs (list): List of frames (image arrays).
            recognizer_model (any model): Model trained with embeddings to predict entities.
            distance_threshold (float): Maximum cosine distance for a match; faces whose
                best match is at or above it are reported as 'unknown'.

        Returns:
            detected_faces (list): One list of detected entities per input frame.
        """
        return [self.recognize_image(frame_embeddings, recognizer_model, distance_threshold)
                for frame_embeddings in self.batch_represent(unknown_imgs)]

    def batch_represent(self, imgs: list):
        """Create face embeddings for a batch of frames.

        Args:
            imgs (list): List of frames (image arrays).

        Returns:
            embeddings (list): One list of face embeddings per input frame, in input
                order. Frames without a detected face get an empty list, so the result
                always has the same length as ``imgs``.
        """
        if not imgs:
            return []

        pil_images = [Image.fromarray(img) for img in imgs]
        boxes, confidences, keypoints = self.detector.detect(pil_images, landmarks=True)

        # Per frame: the aligned crop of every detected face.
        aligned_faces = []
        for img, frame_boxes, frame_confidences, frame_keypoints in zip(imgs, boxes, confidences, keypoints):
            frame_faces = self._faces_from_detection(frame_boxes, frame_confidences, frame_keypoints)
            aligned_faces.append(
                [face_alignment(img, self.target, face['keypoints']) for face in frame_faces])

        flat_aligned_faces = [face for frame_faces in aligned_faces for face in frame_faces]
        if not flat_aligned_faces:
            # No face in the whole batch: still report one (empty) entry per frame.
            return [[] for _ in aligned_faces]

        # Encode all faces of the batch in one call, then split the result per frame.
        flat_embeddings = self.encoder.predict(np.array(flat_aligned_faces))
        embeddings = []
        offset = 0
        for frame_faces in aligned_faces:
            face_count = len(frame_faces)
            embeddings.append(list(flat_embeddings[offset:offset + face_count]))
            offset += face_count
        return embeddings

    def create_embeddings(self):
        """Create face embeddings and entity labels from the thumbnails and save them.

        Entity directories are expected to be named ``ID_Name``; the part before the
        first underscore is used as the label.

        Returns:
            labels (list): List of entity ids, parallel to ``embeddings``.
            embeddings (list): List of face embeddings.
        """
        if self.thumbnail_list is not None:
            entity_dir_list = self.thumbnail_list
        else:
            entity_dir_list = os.listdir(self.thumbnails_path)

        embeddings = []
        labels = []
        for entity_dir in entity_dir_list:
            entity_path = os.path.join(self.thumbnails_path, entity_dir)
            if not os.path.isdir(entity_path):
                continue
            # maxsplit=1 keeps names that themselves contain underscores from failing.
            entity_id = entity_dir.split('_', 1)[0]

            # Each thumbnail is expected to show one face; only the largest is encoded.
            for img_path in image_files_in_folder(entity_path):
                LOGGER.info(f'Encoding {entity_dir}, thumbnail: {img_path}')
                entity_embedding = self.represent(img_path, one_face=True)
                if not entity_embedding:
                    LOGGER.warning(f'Could not create encoding for image {img_path}')
                    continue
                # one_face=True yields at most one embedding.
                embeddings.append(entity_embedding[0])
                labels.append(entity_id)

        # write to disk
        with open(self.labels_path, 'wb') as f:
            pickle.dump(labels, f)
        with open(self.embeddings_path, 'wb') as f:
            pickle.dump(embeddings, f)
        return labels, embeddings

    def load_embeddings(self):
        """Load existing labels and embeddings, or create them if either file is missing.

        Returns:
            labels (list): List of entity ids.
            embeddings (list): List of face embeddings.
        """
        if os.path.exists(self.labels_path) and os.path.exists(self.embeddings_path):
            # NOTE(security): pickle executes arbitrary code on load; only point
            # labels_path / embeddings_path at files this application wrote itself.
            with open(self.labels_path, 'rb') as f:
                labels = pickle.load(f)
            with open(self.embeddings_path, 'rb') as f:
                embeddings = pickle.load(f)
            return labels, embeddings
        return self.create_embeddings()

    def recognize_image(self, unknown_img, recognizer_model=None, distance_threshold=0.6):
        """Recognize entities in an image.

        Args:
            unknown_img (image_path | image object | list): The image to detect entities
                in, or a list of already computed face embeddings (batch use).
            recognizer_model (any model): Model trained with embeddings to predict
                entities. If None, the nearest reference embedding by cosine distance
                is used.
            distance_threshold (float): Maximum cosine distance for a match; faces whose
                best match is at or above it are reported as 'unknown'.

        Returns:
            detected_faces (list): List of detected entities.
        """
        if isinstance(unknown_img, list):  # batch: embeddings were computed by the caller
            unknown_img_embeddings = unknown_img
        else:  # encode single image
            unknown_img_embeddings = self.represent(unknown_img)

        # The reference matrix and its norms do not depend on the face being matched.
        reference = None
        reference_norms = None
        if recognizer_model is None and len(self.embeddings) > 0:
            reference = np.asarray(self.embeddings)
            reference_norms = np.linalg.norm(reference, axis=1)

        detected_faces = []
        for unknown_img_embedding in unknown_img_embeddings:  # for each face in the image
            if recognizer_model is None:  # run basic recognition
                if reference is None:
                    # Nothing to compare against, so the face cannot be identified.
                    detected_faces.append('unknown')
                    continue
                cosine_similarity = np.matmul(reference, unknown_img_embedding) / (
                    reference_norms * np.linalg.norm(unknown_img_embedding))
                face_distances = 1 - cosine_similarity
                best_match = int(np.argmin(face_distances))
                if face_distances[best_match] < distance_threshold:
                    detected_faces.append(self.labels[best_match])
                else:
                    detected_faces.append('unknown')
            else:  # call ANN
                if not recognizer_model.fitted:
                    recognizer_model.fit(embeddings=self.embeddings, labels=self.labels)
                entity = recognizer_model.predict(unknown_img_embedding)
                if entity:
                    detected_faces.append(entity)
        return detected_faces

    def represent(self, img, one_face=False, return_face_number=False):
        """Create face embeddings from a single image.

        Args:
            img (img object | img_path): The image to create the embeddings for.
            one_face (bool): If only the largest face should be considered.
            return_face_number (bool): If the number of faces should be returned for
                distance tuning.

        Returns:
            embeddings (list): List of face embeddings (empty if no face was found or
                the image could not be read).
            OR
            face_number (int): Number of detected faces, returned instead of the
                embeddings if return_face_number is True and that number is not 1.
        """
        if isinstance(img, str):  # img is a path
            img_path = img
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                LOGGER.warning(f'Could not read image {img_path}')
                return 0 if return_face_number else []
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Compatible with the MTCNN from facenet_pytorch
        boxes, confidences, keypoints = self.detector.detect(Image.fromarray(img), landmarks=True)
        faces = self._faces_from_detection(boxes, confidences, keypoints)

        face_number = len(faces)
        if return_face_number and face_number != 1:  # for tuning distance threshold
            return face_number

        # get biggest face from thumbnails
        if one_face and face_number > 1:
            faces = [max(faces, key=lambda face: face['box'][3])]  # box: [x, y, w, h]

        embeddings = []
        for face in faces:
            aligned_face = face_alignment(img, self.target, face['keypoints'])
            aligned_face = np.expand_dims(aligned_face, axis=0)
            embeddings.append(self.encoder.predict(aligned_face)[0])
        return embeddings

    @staticmethod
    def _faces_from_detection(boxes, confidences, keypoints):
        """Convert the detector output for one image into a list of face dicts.

        Args:
            boxes: Face boxes as [x1, y1, x2, y2] per face, or None if no face was found.
            confidences: Detection confidence per face.
            keypoints: Five landmark points per face, in the order left eye, right eye,
                nose, mouth left, mouth right.

        Returns:
            faces (list): One dict per face with 'box' ([x, y, w, h]), 'confidence' and
                'keypoints'. Empty if ``boxes`` is None.
        """
        if boxes is None:
            return []
        return [{
            'box': [box[0], box[1], box[2] - box[0], box[3] - box[1]],
            'confidence': confidence,
            'keypoints': {
                'left_eye': tuple(points[0]),
                'right_eye': tuple(points[1]),
                'nose': tuple(points[2]),
                'mouth_left': tuple(points[3]),
                'mouth_right': tuple(points[4]),
            }} for box, confidence, points in zip(boxes, confidences, keypoints)]