first_order_predictor.py 16.5 KB
Newer Older
L
LielinJiang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import os
import sys
17 18
import cv2
import math
L
LielinJiang 已提交
19 20 21 22 23 24 25 26 27

import yaml
import pickle
import imageio
import numpy as np
from tqdm import tqdm
from scipy.spatial import ConvexHull

import paddle
L
LielinJiang 已提交
28
from ppgan.utils.download import get_path_from_url
L
LielinJiang 已提交
29 30 31
from ppgan.utils.animate import normalize_kp
from ppgan.modules.keypoint_detector import KPDetector
from ppgan.models.generators.occlusion_aware import OcclusionAwareGenerator
32
from ppgan.faceutils import face_detection
L
LielinJiang 已提交
33 34 35 36 37

from .base_predictor import BasePredictor


class FirstOrderPredictor(BasePredictor):
H
houj04 已提交
38

L
LielinJiang 已提交
39 40 41 42 43 44 45
    def __init__(self,
                 output='output',
                 weight_path=None,
                 config=None,
                 relative=False,
                 adapt_scale=False,
                 find_best_frame=False,
46
                 best_frame=None,
47
                 ratio=1.0,
L
lijianshe02 已提交
48
                 filename='result.mp4',
F
FNRE 已提交
49
                 face_detector='sfd',
50
                 multi_person=False,
51 52
                 image_size=256,
                 face_enhancement=False,
L
lzzyzlbb 已提交
53
                 batch_size=1,
H
houj04 已提交
54 55
                 mobile_net=False,
                 slice_size=0):
L
LielinJiang 已提交
56
        if config is not None and isinstance(config, str):
F
FNRE 已提交
57 58
            with open(config) as f:
                self.cfg = yaml.load(f, Loader=yaml.SafeLoader)
L
LielinJiang 已提交
59 60 61 62
        elif isinstance(config, dict):
            self.cfg = config
        elif config is None:
            self.cfg = {
F
FNRE 已提交
63
                'model': {
L
LielinJiang 已提交
64 65 66 67 68
                    'common_params': {
                        'num_kp': 10,
                        'num_channels': 3,
                        'estimate_jacobian': True
                    },
F
FNRE 已提交
69 70 71 72
                    'generator': {
                        'kp_detector_cfg': {
                            'temperature': 0.1,
                            'block_expansion': 32,
L
LielinJiang 已提交
73
                            'max_features': 1024,
F
FNRE 已提交
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
                            'scale_factor': 0.25,
                            'num_blocks': 5
                        },
                        'generator_cfg': {
                            'block_expansion': 64,
                            'max_features': 512,
                            'num_down_blocks': 2,
                            'num_bottleneck_blocks': 6,
                            'estimate_occlusion_map': True,
                            'dense_motion_params': {
                                'block_expansion': 64,
                                'max_features': 1024,
                                'num_blocks': 5,
                                'scale_factor': 0.25
                            }
L
LielinJiang 已提交
89 90 91 92
                        }
                    }
                }
            }
L
lzzyzlbb 已提交
93 94 95
        self.image_size = image_size
        if weight_path is None:
            if mobile_net:
96
                vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-mobile.pdparams'
H
houj04 已提交
97

L
lzzyzlbb 已提交
98
            else:
99 100 101 102
                if self.image_size == 512:
                    vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk-512.pdparams'
                else:
                    vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk.pdparams'
L
lzzyzlbb 已提交
103
            weight_path = get_path_from_url(vox_cpk_weight_url)
L
LielinJiang 已提交
104 105

        self.weight_path = weight_path
106 107
        if not os.path.exists(output):
            os.makedirs(output)
L
LielinJiang 已提交
108
        self.output = output
109
        self.filename = filename
L
LielinJiang 已提交
110 111 112 113
        self.relative = relative
        self.adapt_scale = adapt_scale
        self.find_best_frame = find_best_frame
        self.best_frame = best_frame
114
        self.ratio = ratio
L
lijianshe02 已提交
115
        self.face_detector = face_detector
L
LielinJiang 已提交
116 117
        self.generator, self.kp_detector = self.load_checkpoints(
            self.cfg, self.weight_path)
F
FNRE 已提交
118
        self.multi_person = multi_person
119 120 121 122 123
        self.face_enhancement = face_enhancement
        self.batch_size = batch_size
        if face_enhancement:
            from ppgan.faceutils.face_enhancement import FaceEnhancement
            self.faceenhancer = FaceEnhancement(batch_size=batch_size)
H
houj04 已提交
124
        self.slice_size = slice_size
L
LielinJiang 已提交
125

F
FNRE 已提交
126 127 128 129 130 131
    def read_img(self, path):
        """Read an image with imageio and return it with three channels.

        Args:
            path: Location of the image, passed to ``imageio.imread``.

        Returns:
            An array of shape (H, W, 3). Extra channels (for example a
            fourth channel) are dropped; images with fewer than three
            channels get their first channel replicated.
        """
        img = imageio.imread(path)
        if img.ndim == 2:
            # Single-plane image: add the channel axis first.
            img = img[:, :, np.newaxis]
        channels = img.shape[2]
        if channels > 3:
            # Some images have 4 channels; keep only the first three.
            img = img[:, :, :3]
        elif channels < 3:
            # The rest of the pipeline works on 3-channel images, so repeat
            # the first channel rather than handing on a 1- or 2-channel
            # array.
            img = np.repeat(img[:, :, :1], 3, axis=2)
        return img

L
LielinJiang 已提交
135
    def run(self, source_image, driving_video):
        """Animate the face(s) of a source image with a driving video.

        Every selected face is cropped, animated with ``make_animation``
        and pasted back into a copy of the source image for each driving
        frame. The result is saved to
        ``os.path.join(self.output, self.filename)``; nothing is returned.

        Args:
            source_image: Path of the source image.
            driving_video: Path of the driving video.
        """

        def get_prediction(face_image):
            # Animate one normalised face crop with the whole driving video.
            use_best_frame = (self.find_best_frame
                              or self.best_frame is not None)
            if not use_best_frame:
                return self.make_animation(
                    face_image,
                    driving_video,
                    self.generator,
                    self.kp_detector,
                    relative=self.relative,
                    adapt_movement_scale=self.adapt_scale)

            if self.best_frame is not None:
                i = self.best_frame
            else:
                # find_best_frame_func multiplies its inputs by 255, so it
                # needs the [0, 1] face crop (like the driving frames), not
                # the raw uint8 source image.
                i = self.find_best_frame_func(face_image, driving_video)

            print("Best frame: " + str(i))
            # Animate outwards from the best frame in both directions, then
            # stitch the halves together; the best frame is in both halves,
            # so it is dropped once.
            driving_forward = driving_video[i:]
            driving_backward = driving_video[:(i + 1)][::-1]
            predictions_forward = self.make_animation(
                face_image,
                driving_forward,
                self.generator,
                self.kp_detector,
                relative=self.relative,
                adapt_movement_scale=self.adapt_scale)
            predictions_backward = self.make_animation(
                face_image,
                driving_backward,
                self.generator,
                self.kp_detector,
                relative=self.relative,
                adapt_movement_scale=self.adapt_scale)
            return predictions_backward[::-1] + predictions_forward[1:]

        source_image = self.read_img(source_image)
        reader = imageio.get_reader(driving_video)
        try:
            fps = reader.get_meta_data()['fps']
            driving_video = []
            try:
                for im in reader:
                    driving_video.append(im)
            except RuntimeError:
                # Best effort: continue with the frames decoded so far.
                print("Read driving video error!")
        finally:
            # Release the reader even when reading the metadata fails.
            reader.close()

        driving_video = [
            cv2.resize(frame, (self.image_size, self.image_size)) / 255.0
            for frame in driving_video
        ]
        results = []

        bboxes = self.extract_bbox(source_image.copy())
        print(str(len(bboxes)) + " persons have been detected")

        # for multi person
        for rec in bboxes:
            # rec is [x1, y1, x2, y2, area]; resize and the division both
            # produce new arrays, so the slice needs no copy.
            face_image = source_image[rec[1]:rec[3], rec[0]:rec[2]]
            face_image = cv2.resize(face_image,
                                    (self.image_size, self.image_size)) / 255.0
            predictions = get_prediction(face_image)
            results.append({'rec': rec, 'predict': list(predictions)})
            if len(bboxes) == 1 or not self.multi_person:
                break

        single_face = len(results) == 1
        out_frame = []
        for i in range(len(driving_video)):
            frame = source_image.copy()
            for result in results:
                x1, y1, x2, y2, _ = result['rec']
                out = result['predict'][i]
                out = cv2.resize(out.astype(np.uint8), (x2 - x1, y2 - y1))
                if single_face:
                    frame[y1:y2, x1:x2] = out
                    break
                # Several faces: paste through a filled circle centred on
                # the box so neighbouring boxes are not overwritten by the
                # rectangular crop.
                patch = np.zeros(frame.shape, dtype='uint8')
                patch[y1:y2, x1:x2] = out
                mask = np.zeros(frame.shape[:2], dtype='uint8')
                cx = int((x1 + x2) / 2)
                cy = int((y1 + y2) / 2)
                cv2.circle(mask, (cx, cy), math.ceil((y2 - y1) * self.ratio),
                           (255, 255, 255), -1, 8, 0)
                frame = cv2.copyTo(patch, mask, frame)

            out_frame.append(frame)
        imageio.mimsave(os.path.join(self.output, self.filename),
                        out_frame,
                        fps=fps)
L
LielinJiang 已提交
233 234 235

    def load_checkpoints(self, config, checkpoint_path):
        """Build the generator and keypoint detector and load their weights.

        Args:
            config: Configuration dict with ``config['model']`` holding
                ``common_params`` and ``generator`` (``generator_cfg`` and
                ``kp_detector_cfg``).
            checkpoint_path: File passed to ``paddle.load``; it must provide
                the keys ``'generator'`` and ``'kp_detector'``. When None,
                ``self.weight_path`` is used.

        Returns:
            Tuple ``(generator, kp_detector)``, both switched to eval mode.
        """
        model_cfg = config['model']
        common_params = model_cfg['common_params']

        generator = OcclusionAwareGenerator(
            **model_cfg['generator']['generator_cfg'],
            **common_params,
            inference=True)

        kp_detector = KPDetector(
            **model_cfg['generator']['kp_detector_cfg'],
            **common_params)

        # Honour the argument; previously it was ignored in favour of
        # self.weight_path, which stays as the fallback.
        if checkpoint_path is None:
            checkpoint_path = self.weight_path
        checkpoint = paddle.load(checkpoint_path)
        generator.set_state_dict(checkpoint['generator'])
        kp_detector.set_state_dict(checkpoint['kp_detector'])

        generator.eval()
        kp_detector.eval()

        return generator, kp_detector

    def make_animation(self,
                       source_image,
                       driving_video,
                       generator,
                       kp_detector,
                       relative=True,
                       adapt_movement_scale=True):
        """Generate one output frame per driving frame.

        Args:
            source_image: Source face as an H x W x C array.
            driving_video: Non-empty sequence of H x W x C driving frames.
            generator: Network called with the source batch, ``kp_source``
                and ``kp_driving``; its result is read at ``'prediction'``.
            kp_detector: Network returning a dict with ``'value'`` and
                ``'jacobian'`` for a batch of images.
            relative: Passed to ``normalize_kp`` as
                ``use_relative_movement`` and ``use_relative_jacobian``.
            adapt_movement_scale: Passed to ``normalize_kp``.

        Returns:
            numpy array with all generated batches concatenated along axis
            0, values multiplied by 255 (channels moved to the last axis).

        Raises:
            ValueError: If ``driving_video`` is empty or is not a sequence
                of H x W x C frames.
        """
        driving_video_np = np.array(driving_video).astype(np.float32)
        if driving_video_np.ndim != 4 or driving_video_np.shape[0] == 0:
            raise ValueError(
                "driving_video must be a non-empty sequence of HxWxC frames")
        driving_n, driving_h, driving_w, driving_c = driving_video_np.shape

        with paddle.no_grad():
            predictions = []
            source = paddle.to_tensor(source_image[np.newaxis].astype(
                np.float32)).transpose([0, 3, 1, 2])

            driving_slices = []

            if self.slice_size != 0:
                # slice_size is an element budget per tensor; every slice
                # holds a whole number of batches, so a batch never
                # straddles two slices.
                batch_count_in_slice = int(
                    np.floor(
                        float(self.slice_size) /
                        (self.batch_size * driving_h * driving_w * driving_c)))
                assert batch_count_in_slice > 0, "batch_count_in_slice is 0, use smaller batch_size or bigger slice_size"
                frame_count_in_slice = batch_count_in_slice * self.batch_size
                for slice_start in range(0, driving_n, frame_count_in_slice):
                    slice_end = min(slice_start + frame_count_in_slice,
                                    driving_n)
                    current_slice = paddle.to_tensor(
                        driving_video_np[slice_start:slice_end]).transpose(
                            [0, 3, 1, 2])
                    driving_slices.append(current_slice)
            else:
                # whole driving as a single slice
                driving = paddle.to_tensor(driving_video_np).transpose(
                    [0, 3, 1, 2])
                frame_count_in_slice = driving_n
                driving_slices.append(driving)

            kp_source = kp_detector(source)
            kp_driving_initial = kp_detector(driving_slices[0][0:1])
            # Tile the source and its keypoints once; each batch takes the
            # first frame_num entries.
            kp_source_batch = {
                "value":
                paddle.tile(kp_source["value"],
                            repeat_times=[self.batch_size, 1, 1]),
                "jacobian":
                paddle.tile(kp_source["jacobian"],
                            repeat_times=[self.batch_size, 1, 1, 1]),
            }
            source = paddle.tile(source,
                                 repeat_times=[self.batch_size, 1, 1, 1])

            batch_total = int(np.ceil(float(driving_n) / self.batch_size))
            begin_idx = 0
            for _ in tqdm(range(batch_total)):
                frame_num = min(self.batch_size, driving_n - begin_idx)
                # Locate the batch by its first FRAME index. Using the
                # batch counter here selected the wrong frames whenever
                # batch_size > 1.
                slice_id = begin_idx // frame_count_in_slice
                internal_start = begin_idx - slice_id * frame_count_in_slice
                internal_end = internal_start + frame_num

                driving_frame = driving_slices[slice_id][
                    internal_start:internal_end]

                kp_driving = kp_detector(driving_frame)
                kp_source_img = {
                    "value": kp_source_batch["value"][0:frame_num],
                    "jacobian": kp_source_batch["jacobian"][0:frame_num],
                }

                kp_norm = normalize_kp(
                    kp_source=kp_source,
                    kp_driving=kp_driving,
                    kp_driving_initial=kp_driving_initial,
                    use_relative_movement=relative,
                    use_relative_jacobian=relative,
                    adapt_movement_scale=adapt_movement_scale)

                out = generator(source[0:frame_num],
                                kp_source=kp_source_img,
                                kp_driving=kp_norm)
                img = np.transpose(out['prediction'].numpy(),
                                   [0, 2, 3, 1]) * 255.0

                if self.face_enhancement:
                    img = self.faceenhancer.enhance_from_batch(img)

                predictions.append(img)
                begin_idx += frame_num
        return np.concatenate(predictions)
L
LielinJiang 已提交
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367

    def find_best_frame_func(self, source, driving):
        """Return the index of the driving frame closest to the source face.

        Landmarks from ``face_alignment`` are centred and scale-normalised,
        then compared by summed squared difference.

        Args:
            source: Source face image; multiplied by 255 before detection,
                so values are expected in [0, 1].
            driving: Iterable of driving frames in the same value range.

        Returns:
            Index of the best matching driving frame. Driving frames
            without detected landmarks are skipped; 0 is returned when the
            source has no landmarks or no driving frame could be compared.
        """
        import face_alignment

        def _normalize_landmarks(kp):
            # Named differently from the module-level normalize_kp import
            # so the helper does not shadow it.
            kp = kp - kp.mean(axis=0, keepdims=True)
            # For 2-D points ConvexHull.volume is the enclosed area.
            scale = np.sqrt(ConvexHull(kp[:, :2]).volume)
            kp[:, :2] = kp[:, :2] / scale
            return kp

        fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D,
                                          flip_input=True)

        source_landmarks = fa.get_landmarks(255 * source)
        if not source_landmarks:
            # get_landmarks yields nothing when no face is found; indexing
            # the result would raise a TypeError.
            print("No face landmarks found in source image, use frame 0")
            return 0
        kp_source = _normalize_landmarks(source_landmarks[0])

        best_norm = float('inf')
        best_index = 0
        # tqdm wraps the sequence itself so the bar can show a total.
        for i, image in enumerate(tqdm(driving)):
            landmarks = fa.get_landmarks(255 * image)
            if not landmarks:
                continue
            kp_driving = _normalize_landmarks(landmarks[0])
            new_norm = (np.abs(kp_source - kp_driving)**2).sum()
            if new_norm < best_norm:
                best_norm = new_norm
                best_index = i
        return best_index
368 369 370

    def extract_bbox(self, image):
        """Detect faces and return one enlarged box per person.

        Each detection is grown around its centre by ``margin`` (the longer
        side of the detection) vertically and ``0.8 * margin``
        horizontally, then clipped to the image.

        Args:
            image: H x W x C image array.

        Returns:
            Array with one row ``[x1, y1, x2, y2, area]`` per kept box,
            ordered by decreasing area, or an empty array when nothing is
            detected. A box is dropped when its IOU with an already kept
            (larger) box exceeds 0.5.
        """
        detector = face_detection.FaceAlignment(
            face_detection.LandmarksType._2D,
            flip_input=False,
            face_detector=self.face_detector)

        predictions = detector.get_detections_for_image(np.array([image]))
        if len(predictions) == 0:
            return np.array([])

        h, w, _ = image.shape
        results = []
        for rect in predictions:
            bh = rect[3] - rect[1]
            bw = rect[2] - rect[0]
            cy = rect[1] + int(bh / 2)
            cx = rect[0] + int(bw / 2)
            margin = max(bh, bw)
            y1 = max(0, cy - margin)
            x1 = max(0, cx - int(0.8 * margin))
            y2 = min(h, cy + margin)
            x2 = min(w, cx + int(0.8 * margin))
            area = (y2 - y1) * (x2 - x1)
            results.append([x1, y1, x2, y2, area])

        # If a person has more than one bbox, keep the largest one. Sort in
        # place: the old sorted() call threw its result away, so the boxes
        # were never ordered.
        results.sort(key=lambda box: box[4], reverse=True)

        results_box = [results[0]]
        for candidate in results[1:]:
            # Each row is (x1, y1, x2, y2, area), which is exactly the
            # per-box argument order IOU expects.
            duplicate = any(
                self.IOU(*kept, *candidate) > 0.5 for kept in results_box)
            if not duplicate:
                results_box.append(candidate)
        return np.array(results_box)
F
FNRE 已提交
415 416 417 418 419 420 421 422 423 424 425 426

    def IOU(self, ax1, ay1, ax2, ay2, sa, bx1, by1, bx2, by2, sb):
        """Return the intersection-over-union of boxes A and B.

        Args:
            ax1, ay1, ax2, ay2: Corners of box A.
            sa: Area of box A, supplied by the caller (not recomputed).
            bx1, by1, bx2, by2: Corners of box B.
            sb: Area of box B, supplied by the caller.

        Returns:
            ``intersection / (sa + sb - intersection)``, or 0.0 when the
            boxes do not overlap or the union is not positive.
        """
        w = min(ax2, bx2) - max(ax1, bx1)
        h = min(ay2, by2) - max(ay1, by1)
        # Touching or disjoint boxes share no area; returning early also
        # avoids dividing by zero for degenerate boxes.
        if w <= 0 or h <= 0:
            return 0.0
        intersection = w * h
        union = sa + sb - intersection
        if union <= 0:
            return 0.0
        return 1.0 * intersection / union