Source code for easycv.predictors.feature_extractor

# Copyright (c) Alibaba, Inc. and its affiliates.
import math
from glob import glob

import numpy as np
import torch
from PIL import Image

from easycv.framework.errors import ValueError
from easycv.thirdparty.face_align import glint360k_align
from easycv.thirdparty.mtcnn import FaceDetector
from .base import Predictor
from .builder import PREDICTORS

    from easy_vision.python.inference.predictor import PredictorInterface
    from .interface import PredictorInterface

@PREDICTORS.register_module()
class TorchFeatureExtractor(PredictorInterface):
    """Predictor that returns the 'neck' feature of every input sample."""

    def __init__(self, model_path, model_config=None):
        """
        Build the underlying predictor.

        Args:
          model_path: model file path
          model_config: config string for model to init, in json format
            (accepted by the signature; not read in this constructor)
        """
        self.predictor = Predictor(model_path)
        # Key under which each sample's feature is stored in predict().
        self.output_name = 'feature'

    def get_output_type(self):
        """
        Return a type dict describing how each output key should be
        converted by the caller.

        Supported type names:

        * json: data will be serialized to a json str
        * image: data will be converted to an encoded image binary and
          written to an oss file named
          output_dir/${key}/${input_filename}_${idx}.jpg, where
          input_filename is the base filename extracted from the url and
          key is the key in the output_type dict. If the data indexed by
          key is a list, idx is the element index, otherwise ${idx} is
          empty
        * video: data will be converted to an encoded video binary and
          written to an oss file

        Example::

            return {
                'image': 'image',
                'feature': 'json'
            }

        means the 'image' entry of the output dict is saved to an image
        file and the 'feature' entry is converted to json.

        This implementation declares no conversions and returns an empty
        dict.
        """
        return dict()

    def batch(self, image_tensor_list):
        """Stack a list of same-shaped tensors into one batch tensor."""
        stacked = torch.stack(image_tensor_list)
        return stacked

    def predict(self, input_data_list, batch_size=-1):
        """
        Run feature extraction over the samples, batch by batch.

        Args:
          input_data_list: a list of numpy array, each array is a sample
            to be predicted
          batch_size: number of samples per forward pass; any value <= 0
            means the whole input is processed as a single batch

        Return:
          result: a list of dict, one per sample in input order, each of
            the form {'feature': <squeezed numpy array>}
        """
        total = len(input_data_list)
        assert total > 0, 'input images should not be an empty list'

        # A non-positive batch size collapses everything into one batch.
        if batch_size <= 0:
            batch_size = total

        results = []
        for start in range(0, total, batch_size):
            chunk = input_data_list[start:start + batch_size]
            tensors = self.predictor.preprocess(chunk)
            extracted = self.predictor.predict_batch(
                self.batch(tensors), mode='extract')
            features = extracted['neck'].data.cpu().numpy()
            results.extend([{
                self.output_name: np.squeeze(features[row])
            } for row in range(len(tensors))])
        return results
@PREDICTORS.register_module()
class TorchFaceFeatureExtractor(PredictorInterface):
    """Face feature extractor with optional detection and alignment."""

    def __init__(self, model_path, model_config=None):
        """
        init model, add a facedetect and align for img input.

        Args:
          model_path: either a single ``.pth``/``.pt`` checkpoint (face
            feature model only), or a directory holding exactly one such
            checkpoint plus, optionally, ``weights/*.npy`` files for the
            face detector
          model_config: config string for model to init, in json format
            (accepted by the signature; not read in this constructor)
        """
        # Forward compatibility, to support both pth(only face feature model)
        # or tar.gz(face feature model + mtcnn model)
        if model_path.endswith(('.pth', '.pt')):
            self.predictor = Predictor(model_path)
            self.detector = FaceDetector()
        else:
            face_model = glob('%s/*.pth' % model_path) + glob(
                '%s/*.pt' % model_path)
            assert (len(face_model) == 1)
            self.predictor = Predictor(face_model[0])
            mtcnn_weights = glob('%s/weights/*.npy' % model_path)
            if len(mtcnn_weights) != 3:
                # Only a warning: the detector is still built from
                # model_path below.
                print(
                    "User provide model_path doesn't contain mtcnn models, we try to load weights from http, might failed!"
                )
            self.detector = FaceDetector(dir_path=model_path)

        # Key under which each sample's feature is stored in predict().
        self.output_name = 'feature'

    def get_output_type(self):
        """
        in this function user should return a type dict, which indicates
        which type of data should the output of predictor be converted to

        * type json, data will be serialized to json str
        * type image, data will be converted to encode image binary and
          write to oss file, whose name is
          output_dir/${key}/${input_filename}_${idx}.jpg, where
          input_filename is the base filename extracted from url, key
          corresponds to the key in the dict of output_type, if the type of
          data indexed by key is a list, idx is the index of element in
          list, otherwise ${idx} will be empty
        * type video, data will be converted to encode video binary and
          write to oss file

        ::

            return {
                'image': 'image',
                'feature': 'json'
            }

        indicating that the image data in the output dict will be saved to
        an image file and feature in output dict will be converted to json.

        This implementation returns an empty dict.
        """
        return {}

    def batch(self, image_tensor_list):
        """Stack a list of same-shaped tensors into one batch tensor."""
        return torch.stack(image_tensor_list)

    def predict(self, input_data_list, batch_size=-1, detect_and_align=True):
        """
        Extract one face feature per input image.

        Args:
          input_data_list: a sequence of numpy array or PIL.Image, each
            item is a sample to be predicted. The sequence itself is not
            modified.
          batch_size: number of samples per forward pass; any value <= 0
            means the whole input is processed as a single batch
          detect_and_align: True to detect and align the face before
            feature extraction; False to only resize each image to 112x112

        Return:
          result: a list of dict, one per input image in input order, each
            of the form {'feature': <squeezed numpy array>}

        Note:
          No exception is raised when the detector does not find exactly
          one face. With several faces the first detection is aligned and
          used; with no face the whole image resized to 112x112 is used.
          A message is printed in both cases.
        """
        num_image = len(input_data_list)
        assert num_image > 0, 'input images should not be an empty list'

        # Convert into a NEW list so the caller's list (and the arrays in
        # it) is left untouched; this also makes tuple input work.
        image_list = [
            Image.fromarray(img) if isinstance(img, np.ndarray) else img
            for img in input_data_list
        ]

        # A non-positive batch size collapses everything into one batch.
        if batch_size <= 0:
            batch_size = num_image

        outputs_list = []
        for batch_idx, start in enumerate(range(0, num_image, batch_size)):
            # The slice is a fresh list, so in-place replacement below is
            # local to this batch.
            batch_image_list = image_list[start:start + batch_size]

            if detect_and_align:
                for idx, img in enumerate(batch_image_list):
                    bbox, ld = self.detector.safe_detect(img)
                    if len(bbox) == 0:
                        print(
                            'batch %d , %dth image has no face detected, use original img'
                            % (batch_idx, idx))
                        batch_image_list[idx] = np.array(
                            img.resize((112, 112)))
                        continue
                    if len(bbox) > 1:
                        print('batch %d , %dth image has more then 1 face' %
                              (batch_idx, idx))
                    # One or many faces: always align on the first one.
                    batch_image_list[idx] = np.array(
                        glint360k_align(img, ld[0]))
            else:
                batch_image_list = [
                    np.array(img.resize((112, 112)))
                    for img in batch_image_list
                ]

            image_tensor_list = self.predictor.preprocess(batch_image_list)
            input_data = self.batch(image_tensor_list)
            outputs = self.predictor.predict_batch(
                input_data, mode='extract')['neck'].data.cpu().numpy()

            for idx in range(len(image_tensor_list)):
                outputs_list.append(
                    {self.output_name: np.squeeze(outputs[idx])})

        return outputs_list
@PREDICTORS.register_module()
class TorchMultiFaceFeatureExtractor(PredictorInterface):
    """Face feature extractor that keeps every detected face per image."""

    def __init__(self, model_path, model_config=None):
        """
        Build the feature predictor and the face detector.

        Args:
          model_path: a ``.pth``/``.pt`` checkpoint, or a directory that
            contains exactly one such checkpoint (and possibly
            ``weights/*.npy`` detector weights)
          model_config: config string for model to init, in json format
            (accepted by the signature; not read in this constructor)
        """
        # Forward compatibility, to support both pth(only face feature model)
        # or tar.gz(face feature model + mtcnn model)
        is_checkpoint_file = model_path.endswith(
            '.pth') or model_path.endswith('.pt')
        if not is_checkpoint_file:
            candidates = glob('%s/*.pth' % model_path) + glob(
                '%s/*.pt' % model_path)
            assert (len(candidates) == 1)
            self.predictor = Predictor(candidates[0])
            detector_weights = glob('%s/weights/*.npy' % model_path)
            if len(detector_weights) != 3:
                print(
                    "User provide model_path doesn't contain mtcnn models, we try to load weights from http, might failed!"
                )
            self.detector = FaceDetector(dir_path=model_path)
        else:
            self.predictor = Predictor(model_path)
            self.detector = FaceDetector()

        # Key under which the features are stored in predict().
        self.output_name = 'feature'

    def get_output_type(self):
        """
        Return a type dict describing how each output key should be
        converted by the caller.

        Supported type names:

        * json: data will be serialized to a json str
        * image: data will be converted to an encoded image binary and
          written to an oss file named
          output_dir/${key}/${input_filename}_${idx}.jpg, where
          input_filename is the base filename extracted from the url and
          key is the key in the output_type dict. If the data indexed by
          key is a list, idx is the element index, otherwise ${idx} is
          empty
        * video: data will be converted to an encoded video binary and
          written to an oss file

        Example::

            return {
                'image': 'image',
                'feature': 'json'
            }

        means the 'image' entry of the output dict is saved to an image
        file and the 'feature' entry is converted to json.

        This implementation declares no conversions and returns an empty
        dict.
        """
        return dict()

    def batch(self, image_tensor_list):
        """Stack a list of same-shaped tensors into one batch tensor."""
        stacked = torch.stack(image_tensor_list)
        return stacked

    def predict(self, input_data_list, batch_size=-1, detect_and_align=True):
        """
        Extract features for every face found in every input image.

        Images are processed one at a time; all crops of one image form
        one forward pass.

        Args:
          input_data_list: a list of numpy array or PIL.Image, each item
            is a sample to be predicted
          batch_size: accepted by the signature; not read by this method
          detect_and_align: True to detect and align faces before feature
            extraction; False to only resize each image to 112x112

        Return:
          result: a list of dict, one per input image, each of the form
            {'feature': <numpy array with one row per crop>,
             'bbox': <detected boxes, or [[0, 0, 111, 111, 1.0]] when
                      nothing was detected or detection was skipped>}
        """
        results = []
        for position, sample in enumerate(input_data_list):
            if isinstance(sample, np.ndarray):
                image = Image.fromarray(sample)
            else:
                image = sample

            if not detect_and_align:
                crops = [np.array(image.resize((112, 112)))]
                # x1,y1 x2,y2,score
                bboxes = [[0, 0, 111, 111, 1.0]]
            else:
                bboxes, landmarks = self.detector.safe_detect(image)
                face_count = len(bboxes)
                if face_count == 0:
                    print('%dth image has no face detected, use original img'
                          % position)
                    crops = [np.array(image.resize((112, 112)))]
                elif face_count == 1:
                    crops = [np.array(glint360k_align(image, landmarks[0]))]
                else:
                    print('%dth image has more then 1 face' % position)
                    crops = [
                        np.array(glint360k_align(image, face_landmark))
                        for _, face_landmark in zip(bboxes, landmarks)
                    ]

            tensors = self.predictor.preprocess(crops)
            extracted = self.predictor.predict_batch(
                self.batch(tensors), mode='extract')
            features = extracted['neck'].data.cpu().numpy()

            # No detection: report a placeholder box for the resized image.
            if len(bboxes) == 0:
                bboxes = [[0, 0, 111, 111, 1.0]]

            results.append({self.output_name: features, 'bbox': bboxes})
        return results
@PREDICTORS.register_module()
class TorchFaceAttrExtractor(PredictorInterface):
    """Predict per-face attributes for every face detected in an image."""

    def __init__(
            self,
            model_path,
            model_config=None,
            face_threshold=0.95,
            attr_method=('distribute_sum', 'softmax', 'softmax'),
            attr_name=('age', 'gender', 'emo'),
    ):
        """
        init model

        Args:
          model_path: either a single ``.pth``/``.pt`` checkpoint, or a
            directory holding exactly one such checkpoint plus, optionally,
            ``weights/*.npy`` files for the face detector
          model_config: config string for model to init, in json format
            (accepted by the signature; not read in this constructor)
          face_threshold: a detection is kept only when the last element
            of its bbox is strictly greater than this value
          attr_method: one entry per model output ``neck_<i>_0``:
            - softmax: softmax over dim 1, then argmax (class index)
            - distribute_sum: softmax over dim 1, then the sum of
              ``index * probability`` (expected index)
          attr_name: one name per entry of attr_method; results are
            reported under ``'face_' + name``. For a ``softmax`` attribute
            the class index is translated through ``self.<name>_map``, so
            such a map must exist on the instance.
        """
        if model_path.endswith(('.pth', '.pt')):
            self.predictor = Predictor(model_path)
            self.detector = FaceDetector()
        else:
            face_model = glob('%s/*.pth' % model_path) + glob(
                '%s/*.pt' % model_path)
            assert (len(face_model) == 1)
            self.predictor = Predictor(face_model[0])
            mtcnn_weights = glob('%s/weights/*.npy' % model_path)
            if len(mtcnn_weights) != 3:
                # Only a warning: the detector is still built from
                # model_path below.
                print(
                    "User provide model_path doesn't contain mtcnn models, we try to load weights from http, might failed!"
                )
            self.detector = FaceDetector(dir_path=model_path)

        # Private copies, so neither the shared defaults nor a caller's
        # sequences can be changed through the instance.
        self.attr_name = list(attr_name)
        self.attr_method = list(attr_method)
        assert (len(self.attr_method) == len(self.attr_name))

        # Class-index -> label tables, looked up by name as '<attr>_map'.
        self.gender_map = {0: 'female', 1: 'male'}
        self.emo_map = {
            0: 'Neutral',
            1: 'Happiness',
            2: 'Sadness',
            3: 'Surprise',
            4: 'Fear',
            5: 'Disgust',
            6: 'Anger',
            7: 'Contempt',
        }
        self.pop_map = {
            0: '0',
            1: '1',
            2: '2',
            3: '3',
            4: '4',
            5: '5',
        }
        self.face_threshold = face_threshold

    def get_output_type(self):
        """
        in this function user should return a type dict, which indicates
        which type of data should the output of predictor be converted to

        * type json, data will be serialized to json str
        * type image, data will be converted to encode image binary and
          write to oss file, whose name is
          output_dir/${key}/${input_filename}_${idx}.jpg, where
          input_filename is the base filename extracted from url, key
          corresponds to the key in the dict of output_type, if the type of
          data indexed by key is a list, idx is the index of element in
          list, otherwise ${idx} will be empty
        * type video, data will be converted to encode video binary and
          write to oss file

        ::

            return {
                'image': 'image',
                'feature': 'json'
            }

        indicating that the image data in the output dict will be saved to
        an image file and feature in output dict will be converted to json.

        This implementation returns an empty dict.
        """
        return {}

    def batch(self, image_tensor_list):
        """Stack a list of same-shaped tensors into one batch tensor."""
        return torch.stack(image_tensor_list)

    def _predict_attrs(self, face_image_list):
        """Run the model on aligned faces; return {neck index: numpy array
        with one value per face}."""
        image_tensor_list = self.predictor.preprocess(face_image_list)
        input_data = self.batch(image_tensor_list)
        outputs = self.predictor.predict_batch(input_data, mode='extract')

        neck_output_dict = {}
        for neck_idx, attr_method in enumerate(self.attr_method):
            neck_output = outputs['neck_%d_0' % neck_idx]
            neck_output = torch.nn.Softmax(dim=1)(neck_output)
            if attr_method == 'softmax':
                neck_output = torch.argmax(neck_output, dim=1)
            elif attr_method == 'distribute_sum':
                # Expected class index under the softmax distribution.
                num_bins = neck_output.size(1)
                distribute = torch.arange(
                    0,
                    num_bins,
                    device=neck_output.device,
                    dtype=neck_output.dtype)
                neck_output = (distribute * neck_output).sum(dim=1)
            else:
                raise ValueError(
                    'TorchFaceAttrExtractor for neck %d only support attr_method softmax/distributed sum'
                    % (neck_idx))
            # detach() keeps numpy() valid even if the tensor tracks grad.
            neck_output_dict[neck_idx] = neck_output.detach().cpu().numpy()
        return neck_output_dict

    def predict(self, input_data_list, batch_size=-1):
        """
        Detect faces in every image and predict the configured attributes
        for each face that passes ``face_threshold``.

        Args:
          input_data_list: a list of images, each one is a sample to be
            predicted (passed as-is to the detector and the aligner)
          batch_size: number of images per detection/forward round; any
            value <= 0 means the whole input is processed as one batch

        Return:
          result: a list of dict. There is one dict for every image in
            which the detector reported at least one face, in input order;
            images with no detection are skipped, so the list can be
            shorter than the input. Each dict has one list per attribute,
            keyed ``'face_' + attr_name`` (e.g. 'face_age', 'face_gender'),
            and a 'face_bbox' list; position i of every list describes the
            same face. The lists are empty when every detection of the
            image fell below the threshold.
        """
        num_image = len(input_data_list)
        assert num_image > 0, 'input images should not be an empty list'

        # A non-positive batch size collapses everything into one batch.
        if batch_size <= 0:
            batch_size = num_image

        outputs_list = []
        for batch_idx, start in enumerate(range(0, num_image, batch_size)):
            batch_image_list = input_data_list[start:start + batch_size]

            face_image_list = []
            face_bbox_list = []
            # image index within the batch -> indices into face_image_list
            faceidx_by_imageidx = {}

            for img_idx, img in enumerate(batch_image_list):
                bbox, ld = self.detector.safe_detect(img)
                if len(bbox) == 0:
                    print('batch %d , %dth image has no face detected' %
                          (batch_idx, img_idx))
                    continue
                if len(bbox) > 1:
                    print('batch %d , %dth image has more then %d face' %
                          (batch_idx, img_idx, len(bbox)))

                # Keep confident detections only. The loop variables here
                # must not reuse img_idx: the image index is the key of
                # faceidx_by_imageidx right below.
                confident = [(face_box, face_ld)
                             for face_box, face_ld in zip(bbox, ld)
                             if face_box[-1] > self.face_threshold]

                # this is for multi face detected in one img
                faceidx_by_imageidx[img_idx] = []
                for face_box, face_ld in confident:
                    face_image_list.append(glint360k_align(img, face_ld))
                    face_bbox_list.append(face_box)
                    faceidx_by_imageidx[img_idx].append(
                        len(face_image_list) - 1)

            # Reset per batch so results never leak from a previous batch
            # and the name is always bound, even with no face to score.
            neck_output_dict = {}
            if face_image_list:
                neck_output_dict = self._predict_attrs(face_image_list)

            for face_indices in faceidx_by_imageidx.values():
                single_result = {
                    'face_' + name: []
                    for name in self.attr_name
                }
                single_result['face_bbox'] = []
                for faceidx in face_indices:
                    for k, values in neck_output_dict.items():
                        out = np.squeeze(values[faceidx])
                        if self.attr_method[k] == 'softmax':
                            label_map = getattr(self,
                                                '%s_map' % self.attr_name[k])
                            out = label_map[int(out)]
                        single_result['face_' + self.attr_name[k]].append(out)
                    single_result['face_bbox'].append(face_bbox_list[faceidx])
                outputs_list.append(single_result)

        return outputs_list