From d913d8d79891c2859b356847e2d442efb7f1a6c4 Mon Sep 17 00:00:00 2001 From: lzzyzlbb <287246233@qq.com> Date: Wed, 21 Jul 2021 18:08:57 +0800 Subject: [PATCH] 1.add face enhancement. 2.fix edge problem (#367) * add stargan pretrain model * 1.add face enhancement. 2.fix edge problem * 1.add face enhancement. 2.fix edge problem * 1.add face enhancement. 2.fix edge problem * 1.add face enhancement. 2.fix edge problem * 1.add face enhancement. 2.fix edge problem * 1.add face enhancement. 2.fix edge problem --- applications/tools/first-order-demo.py | 15 +++- applications/tools/wav2lip.py | 9 +- docs/en_US/tutorials/motion_driving.md | 12 ++- docs/en_US/tutorials/wav2lip.md | 8 +- docs/zh_CN/tutorials/motion_driving.md | 12 ++- docs/zh_CN/tutorials/wav2lip.md | 8 +- ppgan/apps/first_order_predictor.py | 51 +++++++---- ppgan/apps/wav2lip_predictor.py | 9 +- ppgan/faceutils/face_enhancement/__init__.py | 15 ++++ .../face_enhancement/face_enhance.py | 78 ++++++++++++++++ ppgan/models/generators/__init__.py | 1 + .../models/generators/generator_styleganv2.py | 64 +++++++++----- ppgan/models/generators/gpen.py | 88 +++++++++++++++++++ ppgan/models/generators/occlusion_aware.py | 25 +++++- 14 files changed, 350 insertions(+), 45 deletions(-) create mode 100644 ppgan/faceutils/face_enhancement/__init__.py create mode 100644 ppgan/faceutils/face_enhancement/face_enhance.py create mode 100644 ppgan/models/generators/gpen.py diff --git a/applications/tools/first-order-demo.py b/applications/tools/first-order-demo.py index cfcadbe..588e9c3 100644 --- a/applications/tools/first-order-demo.py +++ b/applications/tools/first-order-demo.py @@ -73,8 +73,19 @@ parser.add_argument("--image_size", type=int, default=256, help="size of image") +parser.add_argument("--batch_size", + dest="batch_size", + type=int, + default=1, + help="Batch size for fom model") +parser.add_argument( + "--face_enhancement", + dest="face_enhancement", + action="store_true", + help="use face enhance for face") parser.set_defaults(relative=False) parser.set_defaults(adapt_scale=False) +parser.set_defaults(face_enhancement=False) if __name__ == "__main__": args = parser.parse_args() @@ -92,5 +103,7 @@ if __name__ == "__main__": ratio=args.ratio, face_detector=args.face_detector, multi_person=args.multi_person, - image_size=args.image_size) + image_size=args.image_size, + batch_size=args.batch_size, + face_enhancement=args.face_enhancement) predictor.run(args.source_image, args.driving_video) diff --git a/applications/tools/wav2lip.py b/applications/tools/wav2lip.py index 5fd472e..ac655a6 100644 --- a/applications/tools/wav2lip.py +++ b/applications/tools/wav2lip.py @@ -103,6 +103,12 @@ parser.add_argument( type=str, default='sfd', help="face detector to be used, can choose s3fd or blazeface") +parser.add_argument( + "--face_enhancement", + dest="face_enhancement", + action="store_true", + help="use face enhance for face") +parser.set_defaults(face_enhancement=False) if __name__ == "__main__": args = parser.parse_args() @@ -120,5 +126,6 @@ if __name__ == "__main__": box = args.box, rotate = args.rotate, nosmooth = args.nosmooth, - face_detector = args.face_detector) + face_detector = args.face_detector, + face_enhancement = args.face_enhancement) predictor.run(args.face, args.audio, args.outfile) diff --git a/docs/en_US/tutorials/motion_driving.md b/docs/en_US/tutorials/motion_driving.md index ae375ce..5d4e38d 100644 --- a/docs/en_US/tutorials/motion_driving.md +++ b/docs/en_US/tutorials/motion_driving.md @@ -33,7 +33,8 @@ python -u 
tools/first-order-demo.py \ --source_image ../docs/imgs/fom_source_image.png \ --ratio 0.4 \ --relative --adapt_scale \ - --image_size 512 + --image_size 512 \ + --face_enhancement ``` - multi face: @@ -56,7 +57,16 @@ python -u tools/first-order-demo.py \ - ratio: The pasted face percentage of generated image, this parameter should be adjusted in the case of multi-person image in which the adjacent faces are close. The defualt value is 0.4 and the range is [0.4, 0.5]. - image_size: The image size of the face. Default is 256 - multi_person: There are multi faces in the images. Default means only one face in the image +- face_enhancement: enhance the face, default is False ``` +result of face_enhancement: +
(before/after comparison images of face enhancement)
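The new options can also be driven from Python instead of the CLI. A minimal sketch, assuming the constructor keywords introduced by this patch in `ppgan/apps/first_order_predictor.py`; the driving-video path is a placeholder and unlisted arguments keep their defaults:

```python
from ppgan.apps.first_order_predictor import FirstOrderPredictor

# keyword names follow the signature changed in this patch;
# batch_size and face_enhancement are the two new arguments
predictor = FirstOrderPredictor(filename="result.mp4",
                                image_size=512,
                                batch_size=4,           # frames generated per forward pass
                                face_enhancement=True)  # run GPEN on each generated face
predictor.run("../docs/imgs/fom_source_image.png", "driving_video.mp4")
```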
+ ### 2 Training **Datasets:** diff --git a/docs/en_US/tutorials/wav2lip.md b/docs/en_US/tutorials/wav2lip.md index 7c7ffbf..4411b54 100644 --- a/docs/en_US/tutorials/wav2lip.md +++ b/docs/en_US/tutorials/wav2lip.md @@ -11,13 +11,19 @@ Runing the following command to complete the lip-syning task. The output is the ``` cd applications -python tools/wav2lip.py --face ../docs/imgs/mona7s.mp4 --audio ../docs/imgs/guangquan.m4a --outfile pp_guangquan_mona7s.mp4 +python tools/wav2lip.py \ + --face ../docs/imgs/mona7s.mp4 \ + --audio ../docs/imgs/guangquan.m4a \ + --outfile pp_guangquan_mona7s.mp4 \ + --face_enhancement ``` **params:** - face: path of the input image or video file including faces. - audio: path of the input audio file, format can be `.wav`, `.mp3`, `.m4a`. It can be any file supported by `FFMPEG` containing audio data. +- outfile: result video of wav2lip +- face_enhancement: enhance the face, default is False ### 2.2 Training 1. Our model are trained on LRS2. See [here](https://github.com/Rudrabha/Wav2Lip#training-on-datasets-other-than-lrs2) for a few suggestions regarding training on other datasets. diff --git a/docs/zh_CN/tutorials/motion_driving.md b/docs/zh_CN/tutorials/motion_driving.md index af2819b..0dadb1e 100644 --- a/docs/zh_CN/tutorials/motion_driving.md +++ b/docs/zh_CN/tutorials/motion_driving.md @@ -40,7 +40,8 @@ python -u tools/first-order-demo.py \ --source_image ../docs/imgs/fom_source_image.png \ --ratio 0.4 \ --relative --adapt_scale \ - --image_size 512 + --image_size 512 \ + --face_enhancement ``` - 多人脸: ``` @@ -60,7 +61,16 @@ python -u tools/first-order-demo.py \ - ratio: 贴回驱动生成的人脸区域占原图的比例, 用户需要根据生成的效果调整该参数,尤其对于多人脸距离比较近的情况下需要调整改参数, 默认为0.4,调整范围是[0.4, 0.5] - image_size: 图片人脸大小,默认为256 - multi_person: 表示图片中有多张人脸,不加则默认为单人脸 +- face_enhancement: 添加人脸增强,默认为false ``` +添加人脸增强对比如下: +
(人脸增强前后效果对比图)
+ ### 2 训练 **数据集:** diff --git a/docs/zh_CN/tutorials/wav2lip.md b/docs/zh_CN/tutorials/wav2lip.md index d2d2f6d..900a12b 100644 --- a/docs/zh_CN/tutorials/wav2lip.md +++ b/docs/zh_CN/tutorials/wav2lip.md @@ -13,11 +13,17 @@ Wav2Lip实现的是视频人物根据输入音频生成与语音同步的人物 ``` cd applications -python tools/wav2lip.py --face ../docs/imgs/mona7s.mp4 --audio ../docs/imgs/guangquan.m4a --outfile pp_guangquan_mona7s.mp4 +python tools/wav2lip.py \ + --face ../docs/imgs/mona7s.mp4 \ + --audio ../docs/imgs/guangquan.m4a \ + --outfile pp_guangquan_mona7s.mp4 + --face_enhancement ``` **参数说明:** - face: 视频或图片,视频或图片中的人物唇形将根据音频进行唇形合成,以和音频同步 - audio: 驱动唇形合成的音频,视频中的人物将根据此音频进行唇形合成 +- outfile: 合成的视频 +- face_enhancement: 添加人脸增强,默认为false ### 2.2 训练 1. 我们的模型是基于LRS2数据集训练的。可以参考[这里](https://github.com/Rudrabha/Wav2Lip#training-on-datasets-other-than-lrs2)获得在其它训练集上进行训练的一些建议。 diff --git a/ppgan/apps/first_order_predictor.py b/ppgan/apps/first_order_predictor.py index 876fa58..a225d20 100644 --- a/ppgan/apps/first_order_predictor.py +++ b/ppgan/apps/first_order_predictor.py @@ -47,7 +47,9 @@ class FirstOrderPredictor(BasePredictor): filename='result.mp4', face_detector='sfd', multi_person=False, - image_size=256): + image_size=256, + face_enhancement=False, + batch_size=1): if config is not None and isinstance(config, str): with open(config) as f: self.cfg = yaml.load(f, Loader=yaml.SafeLoader) @@ -107,6 +109,11 @@ class FirstOrderPredictor(BasePredictor): self.generator, self.kp_detector = self.load_checkpoints( self.cfg, self.weight_path) self.multi_person = multi_person + self.face_enhancement = face_enhancement + self.batch_size = batch_size + if face_enhancement: + from ppgan.faceutils.face_enhancement import FaceEnhancement + self.faceenhancer = FaceEnhancement(batch_size=batch_size) def read_img(self, path): img = imageio.imread(path) @@ -177,7 +184,7 @@ class FirstOrderPredictor(BasePredictor): face_image = source_image.copy()[rec[1]:rec[3], rec[0]:rec[2]] face_image = cv2.resize(face_image, (self.image_size, self.image_size)) / 255.0 predictions = get_prediction(face_image) - results.append({'rec': rec, 'predict': predictions}) + results.append({'rec': rec, 'predict': [predictions[i] for i in range(predictions.shape[0])]}) if len(bboxes) == 1 or not self.multi_person: break out_frame = [] @@ -188,7 +195,7 @@ class FirstOrderPredictor(BasePredictor): x1, y1, x2, y2, _ = result['rec'] h = y2 - y1 w = x2 - x1 - out = result['predict'][i] * 255.0 + out = result['predict'][i] out = cv2.resize(out.astype(np.uint8), (x2 - x1, y2 - y1)) if len(results) == 1: frame[y1:y2, x1:x2] = out @@ -212,7 +219,7 @@ class FirstOrderPredictor(BasePredictor): generator = OcclusionAwareGenerator( **config['model']['generator']['generator_cfg'], - **config['model']['common_params']) + **config['model']['common_params'], inference=True) kp_detector = KPDetector( **config['model']['generator']['kp_detector_cfg'], @@ -241,14 +248,23 @@ class FirstOrderPredictor(BasePredictor): np.float32)).transpose([0, 3, 1, 2]) driving = paddle.to_tensor( - np.array(driving_video)[np.newaxis].astype( - np.float32)).transpose([0, 4, 1, 2, 3]) + np.array(driving_video).astype( + np.float32)).transpose([0, 3, 1, 2]) kp_source = kp_detector(source) - kp_driving_initial = kp_detector(driving[:, :, 0]) - - for frame_idx in tqdm(range(driving.shape[2])): - driving_frame = driving[:, :, frame_idx] + kp_driving_initial = kp_detector(driving[0:1]) + kp_source_batch = {} + kp_source_batch["value"] = paddle.tile(kp_source["value"], repeat_times=[self.batch_size,1,1]) + 
kp_source_batch["jacobian"] = paddle.tile(kp_source["jacobian"], repeat_times=[self.batch_size,1,1,1]) + source = paddle.tile(source, repeat_times=[self.batch_size,1,1,1]) + begin_idx = 0 + for frame_idx in tqdm(range(int(np.ceil(float(driving.shape[0]) / self.batch_size)))): + frame_num = min(self.batch_size, driving.shape[0] - begin_idx) + driving_frame = driving[begin_idx: begin_idx+frame_num] kp_driving = kp_detector(driving_frame) + kp_source_img = {} + kp_source_img["value"] = kp_source_batch["value"][0:frame_num] + kp_source_img["jacobian"] = kp_source_batch["jacobian"][0:frame_num] + kp_norm = normalize_kp( kp_source=kp_source, kp_driving=kp_driving, @@ -256,11 +272,16 @@ class FirstOrderPredictor(BasePredictor): use_relative_movement=relative, use_relative_jacobian=relative, adapt_movement_scale=adapt_movement_scale) - out = generator(source, kp_source=kp_source, kp_driving=kp_norm) - - predictions.append( - np.transpose(out['prediction'].numpy(), [0, 2, 3, 1])[0]) - return predictions + + out = generator(source[0:frame_num], kp_source=kp_source_img, kp_driving=kp_norm) + img = np.transpose(out['prediction'].numpy(), [0, 2, 3, 1]) * 255.0 + + if self.face_enhancement: + img = self.faceenhancer.enhance_from_batch(img) + + predictions.append(img) + begin_idx += frame_num + return np.concatenate(predictions) def find_best_frame_func(self, source, driving): import face_alignment diff --git a/ppgan/apps/wav2lip_predictor.py b/ppgan/apps/wav2lip_predictor.py index 26a488c..152eedc 100644 --- a/ppgan/apps/wav2lip_predictor.py +++ b/ppgan/apps/wav2lip_predictor.py @@ -28,7 +28,8 @@ class Wav2LipPredictor(BasePredictor): box = [-1, -1, -1, -1], rotate = False, nosmooth = False, - face_detector = 'sfd'): + face_detector = 'sfd', + face_enhancement = False): self.img_size = 96 self.checkpoint_path = checkpoint_path self.static = static @@ -42,6 +43,10 @@ class Wav2LipPredictor(BasePredictor): self.rotate = rotate self.nosmooth = nosmooth self.face_detector = face_detector + self.face_enhancement = face_enhancement + if face_enhancement: + from ppgan.faceutils.face_enhancement import FaceEnhancement + self.faceenhancer = FaceEnhancement() makedirs('./temp', exist_ok=True) def get_smoothened_boxes(self, boxes, T): @@ -271,6 +276,8 @@ class Wav2LipPredictor(BasePredictor): for p, f, c in zip(pred, frames, coords): y1, y2, x1, x2 = c + if self.face_enhancement: + p = self.faceenhancer.enhance_from_image(p) p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) f[y1:y2, x1:x2] = p diff --git a/ppgan/faceutils/face_enhancement/__init__.py b/ppgan/faceutils/face_enhancement/__init__.py new file mode 100644 index 0000000..f429a82 --- /dev/null +++ b/ppgan/faceutils/face_enhancement/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .face_enhance import FaceEnhancement diff --git a/ppgan/faceutils/face_enhancement/face_enhance.py b/ppgan/faceutils/face_enhancement/face_enhance.py new file mode 100644 index 0000000..055fc0b --- /dev/null +++ b/ppgan/faceutils/face_enhancement/face_enhance.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import math +import cv2 +import numpy as np +from ppgan.utils.download import get_path_from_url +from ppgan.models.generators import GPEN +from ppgan.faceutils.face_detection.detection.blazeface.utils import * + +GPEN_weights = 'https://paddlegan.bj.bcebos.com/models/GPEN-512.pdparams' + + +class FaceEnhancement(object): + def __init__(self, + path_to_enhance=None, + size = 512, + batch_size=1 + ): + super(FaceEnhancement, self).__init__() + + # Initialise the face detector + if path_to_enhance is None: + model_weights_path = get_path_from_url(GPEN_weights) + model_weights = paddle.load(model_weights_path) + else: + model_weights = paddle.load(path_to_enhance) + + self.face_enhance = GPEN(size=512, style_dim=512, n_mlp=8) + self.face_enhance.load_dict(model_weights) + self.face_enhance.eval() + self.size = size + self.mask = np.zeros((512, 512), np.float32) + cv2.rectangle(self.mask, (26, 26), (486, 486), (1, 1, 1), -1, cv2.LINE_AA) + self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11) + self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11) + self.mask = paddle.tile(paddle.to_tensor(self.mask).unsqueeze(0).unsqueeze(-1), repeat_times=[batch_size,1,1,3]).numpy() + + + def enhance_from_image(self, img): + if isinstance(img, np.ndarray): + img, _ = resize_and_crop_image(img, 512) + img = paddle.to_tensor(img).transpose([2, 0, 1]) + else: + assert img.shape == [3, 512, 512] + return self.enhance_from_batch(img.unsqueeze(0))[0] + + def enhance_from_batch(self, img): + if isinstance(img, np.ndarray): + img_ori, _ = resize_and_crop_batch(img, 512) + img = paddle.to_tensor(img_ori).transpose([0, 3, 1, 2]) + else: + assert img.shape[1:] == [3, 512, 512] + img_ori = img.transpose([0, 2, 3, 1]).numpy() + img_t = (img/255. 
- 0.5) / 0.5 + + with paddle.no_grad(): + out, __ = self.face_enhance(img_t) + + image_tensor = out * 0.5 + 0.5 + image_tensor = image_tensor.transpose([0, 2, 3, 1]) # RGB + image_numpy = paddle.clip(image_tensor, 0, 1) * 255.0 + + out = image_numpy.astype(np.uint8).cpu().numpy() + return out * self.mask + (1-self.mask) * img_ori diff --git a/ppgan/models/generators/__init__.py b/ppgan/models/generators/__init__.py index 8df2ec1..18d7a43 100755 --- a/ppgan/models/generators/__init__.py +++ b/ppgan/models/generators/__init__.py @@ -32,3 +32,4 @@ from .generator_firstorder import FirstOrderGenerator from .generater_lapstyle import DecoderNet, Encoder, RevisionNet from .basicvsr import BasicVSRNet from .mpr import MPRNet +from .gpen import GPEN diff --git a/ppgan/models/generators/generator_styleganv2.py b/ppgan/models/generators/generator_styleganv2.py index cabfe34..72a6c0a 100644 --- a/ppgan/models/generators/generator_styleganv2.py +++ b/ppgan/models/generators/generator_styleganv2.py @@ -136,18 +136,21 @@ class ModulatedConv2D(nn.Layer): class NoiseInjection(nn.Layer): - def __init__(self): + def __init__(self, is_concat=False): super().__init__() self.weight = self.create_parameter( (1, ), default_initializer=nn.initializer.Constant(0.0)) + self.is_concat = is_concat def forward(self, image, noise=None): if noise is None: batch, _, height, width = image.shape noise = paddle.randn((batch, 1, height, width)) - - return image + self.weight * noise + if self.is_concat: + return paddle.concat([image, self.weight * noise], axis=1) + else: + return image + self.weight * noise class ConstantInput(nn.Layer): @@ -175,6 +178,7 @@ class StyledConv(nn.Layer): upsample=False, blur_kernel=[1, 3, 3, 1], demodulate=True, + is_concat=False ): super().__init__() @@ -188,8 +192,8 @@ class StyledConv(nn.Layer): demodulate=demodulate, ) - self.noise = NoiseInjection() - self.activate = FusedLeakyReLU(out_channel) + self.noise = NoiseInjection(is_concat=is_concat) + self.activate = FusedLeakyReLU(out_channel*2 if is_concat else out_channel) def forward(self, input, style, noise=None): out = self.conv(input, style) @@ -240,6 +244,7 @@ class StyleGANv2Generator(nn.Layer): channel_multiplier=2, blur_kernel=[1, 3, 3, 1], lr_mlp=0.01, + is_concat=False ): super().__init__() @@ -275,8 +280,9 @@ class StyleGANv2Generator(nn.Layer): self.channels[4], 3, style_dim, - blur_kernel=blur_kernel) - self.to_rgb1 = ToRGB(self.channels[4], style_dim, upsample=False) + blur_kernel=blur_kernel, + is_concat=is_concat) + self.to_rgb1 = ToRGB(self.channels[4]*2 if is_concat else self.channels[4], style_dim, upsample=False) self.log_size = int(math.log(size, 2)) self.num_layers = (self.log_size - 2) * 2 + 1 @@ -299,26 +305,29 @@ class StyleGANv2Generator(nn.Layer): self.convs.append( StyledConv( - in_channel, + in_channel*2 if is_concat else in_channel, out_channel, 3, style_dim, upsample=True, blur_kernel=blur_kernel, + is_concat=is_concat, )) self.convs.append( - StyledConv(out_channel, + StyledConv(out_channel*2 if is_concat else out_channel, out_channel, 3, style_dim, - blur_kernel=blur_kernel)) + blur_kernel=blur_kernel, + is_concat=is_concat)) - self.to_rgbs.append(ToRGB(out_channel, style_dim)) + self.to_rgbs.append(ToRGB(out_channel*2 if is_concat else out_channel, style_dim)) in_channel = out_channel self.n_latent = self.log_size * 2 - 2 + self.is_concat = is_concat def make_noise(self): noises = [paddle.randn((1, 1, 2**2, 2**2))] @@ -395,16 +404,29 @@ class StyleGANv2Generator(nn.Layer): skip = self.to_rgb1(out, latent[:, 
1]) i = 1 - for conv1, conv2, noise1, noise2, to_rgb in zip(self.convs[::2], - self.convs[1::2], - noise[1::2], - noise[2::2], - self.to_rgbs): - out = conv1(out, latent[:, i], noise=noise1) - out = conv2(out, latent[:, i + 1], noise=noise2) - skip = to_rgb(out, latent[:, i + 2], skip) - - i += 2 + if self.is_concat: + noise_i = 1 + + outs = [] + for conv1, conv2, to_rgb in zip( + self.convs[::2], self.convs[1::2], self.to_rgbs): + out = conv1(out, latent[:, i], noise=noise[(noise_i + 1)//2]) ### 1 for 2 + out = conv2(out, latent[:, i + 1], noise=noise[(noise_i + 2)//2]) ### 1 for 2 + skip = to_rgb(out, latent[:, i + 2], skip) + + i += 2 + noise_i += 2 + else: + for conv1, conv2, noise1, noise2, to_rgb in zip(self.convs[::2], + self.convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs): + out = conv1(out, latent[:, i], noise=noise1) + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) + + i += 2 image = skip diff --git a/ppgan/models/generators/gpen.py b/ppgan/models/generators/gpen.py new file mode 100644 index 0000000..df72662 --- /dev/null +++ b/ppgan/models/generators/gpen.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# code was heavily based on https://github.com/yangxy/GPEN + +import paddle +import paddle.nn as nn +import math +from ppgan.models.generators import StyleGANv2Generator +from ppgan.models.discriminators.discriminator_styleganv2 import ConvLayer +from ppgan.modules.equalized import EqualLinear + +class GPEN(nn.Layer): + def __init__( + self, + size, + style_dim, + n_mlp, + channel_multiplier=2, + blur_kernel=[1, 3, 3, 1], + lr_mlp=0.01, + ): + super(GPEN, self).__init__() + channels = { + 4: 512, + 8: 512, + 16: 512, + 32: 512, + 64: 256 * channel_multiplier, + 128: 128 * channel_multiplier, + 256: 64 * channel_multiplier, + 512: 32 * channel_multiplier, + 1024: 16 * channel_multiplier, + } + + self.log_size = int(math.log(size, 2)) + self.generator = StyleGANv2Generator(size, + style_dim, + n_mlp, + channel_multiplier=channel_multiplier, + blur_kernel=blur_kernel, + lr_mlp=lr_mlp, + is_concat=True) + + conv = [ConvLayer(3, channels[size], 1)] + self.ecd0 = nn.Sequential(*conv) + in_channel = channels[size] + + self.names = ['ecd%d'%i for i in range(self.log_size-1)] + for i in range(self.log_size, 2, -1): + out_channel = channels[2 ** (i - 1)] + conv = [ConvLayer(in_channel, out_channel, 3, downsample=True)] + setattr(self, self.names[self.log_size-i+1], nn.Sequential(*conv)) + in_channel = out_channel + self.final_linear = nn.Sequential(EqualLinear(channels[4] * 4 * 4, style_dim, activation='fused_lrelu')) + + def forward(self, + inputs, + return_latents=False, + inject_index=None, + truncation=1, + truncation_latent=None, + input_is_latent=False, + ): + noise = [] + for i in range(self.log_size-1): + ecd = getattr(self, self.names[i]) + inputs = ecd(inputs) + noise.append(inputs) + inputs = inputs.reshape([inputs.shape[0], -1]) + outs = self.final_linear(inputs) + outs = self.generator([outs], return_latents, inject_index, truncation, + truncation_latent, input_is_latent, + noise=noise[::-1]) + return outs + + diff --git a/ppgan/models/generators/occlusion_aware.py b/ppgan/models/generators/occlusion_aware.py index 0e01d9f..1ce8aa1 100644 --- a/ppgan/models/generators/occlusion_aware.py +++ b/ppgan/models/generators/occlusion_aware.py @@ -17,8 +17,10 @@ import paddle from paddle import nn import paddle.nn.functional as F -from ...modules.first_order import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d +from ...modules.first_order import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d, make_coordinate_grid from ...modules.dense_motion import DenseMotionNetwork +import numpy as np +import cv2 class OcclusionAwareGenerator(nn.Layer): @@ -35,7 +37,8 @@ class OcclusionAwareGenerator(nn.Layer): num_bottleneck_blocks, estimate_occlusion_map=False, dense_motion_params=None, - estimate_jacobian=False): + estimate_jacobian=False, + inference=False): super(OcclusionAwareGenerator, self).__init__() if dense_motion_params is not None: @@ -89,6 +92,8 @@ class OcclusionAwareGenerator(nn.Layer): padding=(3, 3)) self.estimate_occlusion_map = estimate_occlusion_map self.num_channels = num_channels + self.inference = inference + self.pad = 5 def deform_input(self, inp, deformation): _, h_old, w_old, _ = deformation.shape @@ -100,6 +105,16 @@ class OcclusionAwareGenerator(nn.Layer): mode='bilinear', align_corners=False) deformation = deformation.transpose([0, 2, 3, 1]) + if self.inference: + identity_grid = make_coordinate_grid((h, w), + type=inp.dtype) + identity_grid = identity_grid.reshape([1, h, w, 2]) + visualization_matrix = np.zeros((h,w)).astype("float32") + 
visualization_matrix[self.pad:h-self.pad, self.pad:w-self.pad] = 1.0 + gauss_kernel = paddle.to_tensor(cv2.GaussianBlur(visualization_matrix , (9, 9), 0.0, borderType=cv2.BORDER_ISOLATED)) + gauss_kernel = gauss_kernel.unsqueeze(0).unsqueeze(-1) + deformation = gauss_kernel * deformation + (1-gauss_kernel) * identity_grid + return F.grid_sample(inp, deformation, mode='bilinear', @@ -136,6 +151,12 @@ class OcclusionAwareGenerator(nn.Layer): size=out.shape[2:], mode='bilinear', align_corners=False) + if self.inference: + h,w = occlusion_map.shape[2:] + occlusion_map[:,:,0:self.pad,:] = 1.0 + occlusion_map[:,:,:,0:self.pad] = 1.0 + occlusion_map[:,:,h-self.pad:h,:] = 1.0 + occlusion_map[:,:,:,w-self.pad:w] = 1.0 out = out * occlusion_map output_dict["deformed"] = self.deform_input(source_image, -- GitLab
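A note on the `face_enhance.py` hunk above: the seamless paste-back of the enhanced face relies on a Gaussian-feathered rectangular mask. A standalone NumPy/OpenCV sketch of that blending step, with the function name and defaults chosen here for illustration but mirroring the constants in the patch (26-pixel margin, 101x101 Gaussian applied twice):

```python
import cv2
import numpy as np

def blend_enhanced(enhanced, original, pad=26, ksize=101, sigma=11):
    """Blend the enhanced face into the original crop with a feathered mask.

    enhanced / original: HxWx3 uint8 images of the same size.
    """
    h, w = original.shape[:2]
    mask = np.zeros((h, w), np.float32)
    # solid rectangle inside a `pad`-pixel border...
    cv2.rectangle(mask, (pad, pad), (w - pad, h - pad), 1.0, -1, cv2.LINE_AA)
    # ...feathered by blurring twice, as in the patch
    mask = cv2.GaussianBlur(mask, (ksize, ksize), sigma)
    mask = cv2.GaussianBlur(mask, (ksize, ksize), sigma)
    mask = mask[..., None]
    return (mask * enhanced + (1.0 - mask) * original).astype(np.uint8)
```

This is why `FaceEnhancement.enhance_from_batch` returns `out * self.mask + (1 - self.mask) * img_ori`: only the interior of the 512x512 crop is replaced, and the border fades back to the original pixels.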
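The `deform_input` change in `occlusion_aware.py` is the "fix edge problem" half of this patch: at inference time the predicted deformation grid is feathered back toward the identity grid near the crop border, so `grid_sample` no longer drags in pixels from outside the face crop. A self-contained sketch of that blending under the usual `[-1, 1]` sampling-grid convention (function and variable names are illustrative):

```python
import cv2
import numpy as np
import paddle

def soften_deformation(deformation, pad=5):
    """deformation: [N, H, W, 2] sampling grid with coordinates in [-1, 1]."""
    n, h, w, _ = deformation.shape
    # identity grid laid out as (x, y), matching grid_sample's convention
    ys, xs = np.meshgrid(np.linspace(-1, 1, h), np.linspace(-1, 1, w), indexing="ij")
    identity = paddle.to_tensor(
        np.stack([xs, ys], axis=-1).astype("float32")).reshape([1, h, w, 2])
    # weight is 1 in the interior, 0 on a `pad`-pixel border, feathered by a Gaussian
    weight = np.zeros((h, w), dtype="float32")
    weight[pad:h - pad, pad:w - pad] = 1.0
    weight = cv2.GaussianBlur(weight, (9, 9), 0.0, borderType=cv2.BORDER_ISOLATED)
    weight = paddle.to_tensor(weight).unsqueeze(0).unsqueeze(-1)
    # near the border the grid falls back to the identity mapping
    return weight * deformation + (1.0 - weight) * identity
```

The same hunk also clamps a `pad`-pixel border of the occlusion map to 1.0, so the very edge of the pasted region keeps the original background instead of being darkened by the occlusion estimate.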