Unverified commit 8aafe376 authored by: W wangna11BD committed by: GitHub

Add msvsr (#496)

* fix benchmark dataset

* fix edvr and basic bug

* add PP-MSVSR

* add experiment results in docs

* fix ssim

* modif

* modif2
Parent 6a31877f
......@@ -39,7 +39,7 @@ run_cmd="set -xe;
nvidia-docker run --name test_paddlegan -i \
--net=host \
--shm-size=1g \
--shm-size=128g \
-v $PWD:/workspace \
${ImageName} /bin/bash -c "${run_cmd}"
```
......
......@@ -59,7 +59,7 @@ dataset:
test:
name: VSRFolderDataset
# for udm10 dataset
# for UDM10 dataset
# lq_folder: data/udm10/BDx4
# gt_folder: data/udm10/GT
lq_folder: data/Vid4/BDx4
......@@ -67,7 +67,7 @@ dataset:
preprocess:
- name: GetNeighboringFramesIdx
interval_list: [1]
# for udm10 dataset
# for UDM10 dataset
# filename_tmpl: '{:04d}.png'
filename_tmpl: '{:08d}.png'
- name: ReadImageSequence
......
......@@ -23,8 +23,8 @@ dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4 # 6
batch_size: 2 # 4*2
num_workers: 4
batch_size: 2 #4 GPUs
dataset:
name: SRREDSMultipleGTDataset
mode: train
......
......@@ -43,7 +43,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -42,7 +42,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -46,7 +46,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -42,7 +42,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -23,8 +23,8 @@ dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4 # 6
batch_size: 2 # 4*2
num_workers: 4
batch_size: 2 #4 GPUs
dataset:
name: SRREDSMultipleGTDataset
mode: train
......
......@@ -32,7 +32,7 @@ dataset:
load_size: 136
crop_size: 128
num_workers: 16
batch_size: 5
batch_size: 5 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
......@@ -38,7 +38,7 @@ dataset:
load_size: 280
crop_size: 256
num_workers: 16
batch_size: 5
batch_size: 5 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
......@@ -38,7 +38,7 @@ dataset:
load_size: 540
crop_size: 512
num_workers: 16
batch_size: 2
batch_size: 2 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
total_iters: 300000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: 2500
generator:
name: MSVSR
mid_channels: 64
num_init_blocks: 5
num_blocks: 7
num_reconstruction_blocks: 5
only_last: False
use_tiny_spynet: False
deform_groups: 8
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 2
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4
batch_size: 2 #8 gpus
use_shared_memory: True
dataset:
name: SRREDSMultipleGTDataset
mode: train
lq_folder: data/REDS/train_sharp_bicubic/X4
gt_folder: data/REDS/train_sharp/X4
crop_size: 256
interval_list: [1]
random_reverse: False
number_frames: 30
use_flip: True
use_rot: True
scale: 4
val_partition: REDS4
test:
name: SRREDSMultipleGTDataset
mode: test
lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
gt_folder: data/REDS/REDS4_test_sharp/X4
interval_list: [1]
random_reverse: False
number_frames: 100
use_flip: False
use_rot: False
scale: 4
val_partition: REDS4
num_workers: 0
batch_size: 1
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [300000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 5000
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: false
ssim:
name: SSIM
crop_border: 0
test_y_channel: false
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 5000
total_iters: 150000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: 2500
generator:
name: MSVSR
mid_channels: 32
num_init_blocks: 2
num_blocks: 3
num_reconstruction_blocks: 2
only_last: True
use_tiny_spynet: True
deform_groups: 4
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 1
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 6
batch_size: 2 #8 gpus
use_shared_memory: True
dataset:
name: SRREDSMultipleGTDataset
mode: train
lq_folder: data/REDS/train_sharp_bicubic/X4
gt_folder: data/REDS/train_sharp/X4
crop_size: 256
interval_list: [1]
random_reverse: False
number_frames: 20
use_flip: True
use_rot: True
scale: 4
val_partition: REDS4
test:
name: SRREDSMultipleGTDataset
mode: test
lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
gt_folder: data/REDS/REDS4_test_sharp/X4
interval_list: [1]
random_reverse: False
number_frames: 100
use_flip: False
use_rot: False
scale: 4
val_partition: REDS4
num_workers: 0
batch_size: 1
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [150000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 5000
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: false
ssim:
name: SSIM
crop_border: 0
test_y_channel: false
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 5000
total_iters: 300000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: -1
generator:
name: MSVSR
mid_channels: 32
num_init_blocks: 2
num_blocks: 3
num_reconstruction_blocks: 2
only_last: True
use_tiny_spynet: True
deform_groups: 4
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 1
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4
batch_size: 2 #8 gpus
dataset:
name: VSRVimeo90KDataset
# mode: train
lq_folder: data/vimeo90k/vimeo_septuplet_BD_matlabLRx4/sequences
gt_folder: data/vimeo90k/vimeo_septuplet/sequences
ann_file: data/vimeo90k/vimeo_septuplet/sep_trainlist.txt
preprocess:
- name: ReadImageSequence
key: lq
- name: ReadImageSequence
key: gt
- name: Transforms
input_keys: [lq, gt]
pipeline:
- name: SRPairedRandomCrop
gt_patch_size: 256
scale: 4
keys: [image, image]
- name: PairedRandomHorizontalFlip
keys: [image, image]
- name: PairedRandomVerticalFlip
keys: [image, image]
- name: PairedRandomTransposeHW
keys: [image, image]
- name: TransposeSequence
keys: [image, image]
- name: MirrorVideoSequence
- name: NormalizeSequence
mean: [0., .0, 0.]
std: [255., 255., 255.]
keys: [image, image]
test:
name: VSRFolderDataset
# for udm10 dataset
# lq_folder: data/udm10/BDx4
# gt_folder: data/udm10/GT
lq_folder: data/Vid4/BDx4
gt_folder: data/Vid4/GT
preprocess:
- name: GetNeighboringFramesIdx
interval_list: [1]
# for udm10 dataset
# filename_tmpl: '{:04d}.png'
filename_tmpl: '{:08d}.png'
- name: ReadImageSequence
key: lq
- name: ReadImageSequence
key: gt
- name: Transforms
input_keys: [lq, gt]
pipeline:
- name: TransposeSequence
keys: [image, image]
- name: NormalizeSequence
mean: [0., .0, 0.]
std: [255., 255., 255.]
keys: [image, image]
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [300000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 2500
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: true
ssim:
name: SSIM
crop_border: 0
test_y_channel: true
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 2500
......@@ -3,15 +3,22 @@
## 1.1 Principle
Video super-resolution originates from image super-resolution, which aims to recover high-resolution (HR) images from one or more low resolution (LR) images. The difference between them is that the video is composed of multiple frames, so the video super-resolution usually uses the information between frames to repair. Here we provide the video super-resolution model [EDVR](https://arxiv.org/pdf/1905.02716.pdf).[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf).
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is that a video is composed of multiple frames, so video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.
[EDVR](https://arxiv.org/pdf/1905.02716.pdf) won the championship and outperformed the second place by a large margin in all four tracks of the NTIRE19 video restoration and enhancement challenges. The main difficulties of video super-resolution come from two aspects: (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR devises a Pyramid, Cascading and Deformable (PCD) alignment module, in which frame alignment is done at the feature level using deformable convolutions in a coarse-to-fine manner. Second, EDVR proposes a Temporal and Spatial Attention (TSA) fusion module, in which attention is applied both temporally and spatially, so as to emphasize important features for subsequent restoration.
[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components for VSR guided by four basic functionalities, i.e., propagation, alignment, aggregation, and upsampling. By reusing some existing components with minimal redesigns, the resulting succinct pipeline, BasicVSR, achieves appealing improvements in speed and restoration quality in comparison to many state-of-the-art algorithms. By presenting an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be expanded to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By empowering the recurrent framework with enhanced propagation and alignment, BasicVSR++ can exploit spatiotemporal information across misaligned video frames more effectively. The new components lead to improved performance under a similar computational constraint. In particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. In NTIRE 2021, BasicVSR++ won three championships and one runner-up in the Video Super-Resolution and Compressed Video Enhancement Challenges.
PP-MSVSR is a multi-stage VSR deep architecture with a local fusion module, an auxiliary loss, and a refined alignment module to refine the enhanced result progressively. Specifically, to strengthen the fusion of features across frames during feature propagation, a local fusion module is designed in stage-1 to perform local feature fusion before propagation. Moreover, an auxiliary loss is introduced in stage-2 to make the features obtained by the propagation module retain more information correlated with the HR space, and a refined alignment module is introduced in stage-3 to make full use of the feature information from the previous stage. Extensive experiments substantiate that PP-MSVSR achieves promising performance on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
## 1.2 How to use
### 1.2.1 Prepare Datasets
Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4, and UDM10. REDS and Vimeo90K include training and test sets, while Vid4 and UDM10 are test datasets. Download and decompress the required datasets and place them under ``PaddleGAN/data``.
REDS ([download](https://seungjunnah.github.io/Datasets/reds.html)) is a newly proposed high-quality (720p) video dataset from the NTIRE19 competition. REDS consists of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test ground truth is not available, we select four representative clips (namely '000', '011', '015', '020', with diverse scenes and motions) as our test set, denoted by REDS4. The remaining training and validation clips are re-grouped as our training dataset (a total of 266 clips).
......@@ -31,6 +38,49 @@
...
```
Vimeo90K ([download](http://toflow.csail.mit.edu/)) is designed by Tianfan Xue et al. for the following four video processing tasks: temporal frame interpolation, video denoising, video deblocking, and video super-resolution. Vimeo90K is a large-scale, high-quality video dataset. This dataset consists of 89,800 video clips downloaded from vimeo.com, which covers a large variety of scenes and actions.
The structure of the processed Vimeo90K is as follows:
```
PaddleGAN
├── data
├── Vimeo90K
├── vimeo_septuplet
| |──sequences
| └──sep_trainlist.txt
├── vimeo_septuplet_BD_matlabLRx4
| └──sequences
└── vimeo_super_resolution_test
|──low_resolution
|──target
└──sep_testlist.txt
...
```
Vid4 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for VSR, which contains 4 video segments.
The structure of the processed Vid4 is as follows:
```
PaddleGAN
├── data
├── Vid4
├── BDx4
└── GT
...
```
UDM10 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for VSR, which contains 10 video segments.
The structure of the processed UDM10 is as follows:
```
PaddleGAN
├── data
├── udm10
├── BDx4
└── GT
...
```
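For the two test-only datasets, the archives linked above can be fetched and unpacked in place. The commands below are one way to do this (a sketch; it assumes the archives extract to the ``Vid4/`` and ``udm10/`` layouts shown in the trees above):
```
cd PaddleGAN
wget https://paddlegan.bj.bcebos.com/datasets/Vid4.zip -P data/
unzip data/Vid4.zip -d data/
wget https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar -P data/
tar -xf data/udm10_paddle.tar -C data/
```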
### 1.2.2 Train/Test
According to the number of intermediate channels, EDVR is divided into EDVR_L (128 channels) and EDVR_M (64 channels). Below, model training and testing are introduced taking EDVR_M as an example.
......@@ -63,24 +113,37 @@
python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```
To train or test other VSR models, you can find the config file of the corresponding model under ``PaddleGAN/configs``, then replace the config file in the command with that of the desired VSR model.
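For example, to evaluate PP-MSVSR the same pattern applies (a sketch; it assumes the PP-MSVSR config added in this commit is named ``configs/msvsr_reds.yaml``):
```
python tools/main.py --config-file configs/msvsr_reds.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```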
## 1.3 Results
The experimental results are evaluated on RGB channels.
The metrics are PSNR / SSIM.
VSR quantitative comparison on REDS4, the test set of the REDS dataset
| Method | Parameters (M) | FLOPs (G) | REDS4 |
|---|---|---|---|
| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
Deblurring quantitative comparison on REDS4, the test set of the REDS dataset
| Method | REDS4 |
|---|---|
| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 |
| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
| BasicVSR_x4 | 31.4325 / 0.8913 |
| IconVSR_x4 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 32.4018 / 0.9071 |
VSR quantitative comparison on Vimeo90K, Vid4, and UDM10
| Model | Vimeo90K | Vid4 | UDM10 |
|---|---|---|---|
| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
## 1.4 Model Download
| Method | Dataset | Download Link |
......@@ -94,7 +157,9 @@ The metrics are PSNR / SSIM.
| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
......@@ -133,3 +198,10 @@ The metrics are PSNR / SSIM.
year = {2021}
}
```
- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]()
```
@article{
}
```
......@@ -3,16 +3,22 @@
## 1.1 Principle
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is clear: since a video is composed of multiple frames, video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), and [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf).
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is clear: since a video is composed of multiple frames, video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.
[EDVR](https://arxiv.org/pdf/1905.02716.pdf) won the championship in all four tracks of the NTIRE19 video restoration and enhancement challenges, surpassing the second place by a large margin. The main difficulties of video super-resolution are (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR designs a Pyramid, Cascading and Deformable (PCD) alignment module, in which coarse-to-fine deformable convolutions are used for frame alignment at the feature level. Second, EDVR uses a Temporal and Spatial Attention (TSA) fusion module, which applies attention both temporally and spatially to emphasize important features for subsequent restoration.
[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components of the four basic VSR modules (i.e., propagation, alignment, aggregation, and upsampling). By adding some small designs and reusing some existing components, the succinct BasicVSR is obtained. Compared with many state-of-the-art algorithms, BasicVSR achieves appealing improvements in speed and restoration quality. Meanwhile, by adding an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be extended to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By enhancing the recurrent framework with stronger propagation and alignment, BasicVSR++ can exploit the spatiotemporal information of misaligned video frames more effectively. Under similar computational constraints, the new components improve performance: in particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. BasicVSR++ won three championships and one runner-up in the NTIRE 2021 Video Super-Resolution and Compressed Video Enhancement Challenges.
PP-MSVSR is a multi-stage video super-resolution deep architecture with a local fusion module, an auxiliary loss, and a refined alignment module to progressively refine the enhanced result. Specifically, a local fusion module is designed in stage-1 to perform local feature fusion before feature propagation, strengthening the fusion of cross-frame features during propagation. An auxiliary loss is introduced in stage-2 so that the features obtained by the propagation module retain more information related to the HR space, and a refined alignment module is introduced in stage-3 to make full use of the feature information from the propagation module of the previous stage. Extensive experiments confirm that PP-MSVSR performs remarkably on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
## 1.2 How to use
### 1.2.1 Prepare Datasets
Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4, and UDM10. REDS and Vimeo90K include training and test sets, while Vid4 and UDM10 are test datasets. Download and decompress the required datasets and place them under ``PaddleGAN/data``.
REDS ([download](https://seungjunnah.github.io/Datasets/reds.html)) is a newly proposed high-quality (720p) video dataset from the NTIRE19 competition, consisting of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test set is not available, four representative clips (namely '000', '011', '015', '020', with diverse scenes and motions) are selected from the training set as the test set, denoted by REDS4. The remaining training and validation clips are re-grouped as the training dataset (a total of 266 clips).
The structure of the processed REDS dataset is as follows:
......@@ -31,6 +37,49 @@
...
```
Vimeo90K ([download](http://toflow.csail.mit.edu/)) is a dataset built by Tianfan Xue et al. for four video processing tasks: video super-resolution, video denoising, video deblocking, and video frame interpolation. Vimeo90K is a large-scale, high-quality video dataset containing 89,800 video clips downloaded from vimeo.com, covering a large variety of scenes and actions.
The structure of the processed Vimeo90K dataset is as follows:
```
PaddleGAN
├── data
├── Vimeo90K
├── vimeo_septuplet
| |──sequences
| └──sep_trainlist.txt
├── vimeo_septuplet_BD_matlabLRx4
| └──sequences
└── vimeo_super_resolution_test
|──low_resolution
|──target
└──sep_testlist.txt
...
```
Vid4 ([download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for video super-resolution, containing 4 video clips.
The structure of the processed Vid4 dataset is as follows:
```
PaddleGAN
├── data
├── Vid4
├── BDx4
└── GT
...
```
UDM10 ([download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for video super-resolution, containing 10 video clips.
The structure of the processed UDM10 dataset is as follows:
```
PaddleGAN
├── data
├── udm10
├── BDx4
└── GT
...
```
### 1.2.2 Train/Test
According to the number of intermediate channels, EDVR is divided into EDVR_L (128 channels) and EDVR_M (64 channels). Below, model training and testing are introduced taking EDVR_M as an example.
......@@ -59,23 +108,37 @@
python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```
To train or test other video super-resolution models, you can find the config file of the corresponding model under ``PaddleGAN/configs``, then replace the config file in the command with that of the corresponding model.
## 1.3 Results
The experimental results are evaluated on RGB channels.
The metrics are PSNR / SSIM.
VSR quantitative comparison on REDS4, the test set of the REDS dataset
| Model | Parameters (M) | FLOPs (G) | REDS4 |
|---|---|---|---|
| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
Deblurring quantitative comparison on REDS4, the test set of the REDS dataset
| Model | REDS4 |
|---|---|
| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 |
| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
| BasicVSR_x4 | 31.4325 / 0.8913 |
| IconVSR_x4 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 32.4018 / 0.9071 |
VSR quantitative comparison on Vimeo90K, Vid4, and UDM10
| Model | Vimeo90K | Vid4 | UDM10 |
|---|---|---|---|
| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
## 1.4 Model Download
| Model | Dataset | Download Link |
......@@ -89,8 +152,9 @@
| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
# References
......@@ -125,3 +189,10 @@
year = {2021}
}
```
- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]()
```
@article{
}
```
......@@ -212,8 +212,8 @@ def calculate_ssim(img1,
f'Wrong input_order {input_order}. Supported input_orders are '
'"HWC" and "CHW"')
img1 = img1.copy().astype('float32')[..., ::-1]
img2 = img2.copy().astype('float32')[..., ::-1]
img1 = img1.copy().astype('float32')
img2 = img2.copy().astype('float32')
img1 = reorder_image(img1, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
......
......@@ -33,3 +33,4 @@ from .lapstyle_model import LapStyleDraModel, LapStyleRevFirstModel, LapStyleRev
from .basicvsr_model import BasicVSRModel
from .mpr_model import MPRModel
from .photopen_model import PhotoPenModel
from .msvsr_model import MultiStageVSRModel
......@@ -37,3 +37,4 @@ from .gpen import GPEN
from .pan import PAN
from .generater_photopen import SPADEGenerator
from .basicvsr_plus_plus import BasicVSRPlusPlus
from .msvsr import MSVSR
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.vision.ops import DeformConv2D
from .basicvsr import PixelShufflePack, flow_warp, SPyNet, ResidualBlocksWithInputConv
from ...utils.download import get_path_from_url
from ...modules.init import kaiming_normal_, constant_
from .builder import GENERATORS
@GENERATORS.register()
class MSVSR(nn.Layer):
"""PP-MSVSR network structure for video super-resolution.
Support only x4 upsampling.
Paper:
PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
Args:
mid_channels (int): Channel number of the intermediate features.
Default: 32.
num_init_blocks (int): Number of residual blocks in feat_extract.
Default: 2.
num_blocks (int): Number of residual blocks in each propagation branch.
Default: 3.
num_reconstruction_blocks (int): Number of residual blocks in reconstruction.
Default: 2.
only_last (bool): Whether the HR feature only goes through the last convolution.
Default: True.
use_tiny_spynet (bool): Whether to use the tiny SPyNet.
Default: True.
deform_groups (int): Number of deformable_groups in DeformConv2D in stage2 and stage3.
Default: 4.
stage1_groups (int): Number of deformable_groups in DeformConv2D in stage1.
Default: 8.
auxiliary_loss (bool): Whether to use the auxiliary loss.
Default: True.
use_refine_align (bool): Whether to use refined alignment.
Default: True.
aux_reconstruction_blocks (int): Number of residual blocks in auxiliary reconstruction.
Default: 1.
use_local_connnect (bool): Whether to add the stage1 feature after upsampling.
Default: True.
"""
def __init__(self,
mid_channels=32,
num_init_blocks=2,
num_blocks=3,
num_reconstruction_blocks=2,
only_last=True,
use_tiny_spynet=True,
deform_groups=4,
stage1_groups=8,
auxiliary_loss=True,
use_refine_align=True,
aux_reconstruction_blocks=1,
use_local_connnect=True):
super().__init__()
self.mid_channels = mid_channels
self.only_last = only_last
self.deform_groups = deform_groups
self.auxiliary_loss = auxiliary_loss
self.use_refine_align = use_refine_align
self.use_local_connnect = use_local_connnect
# optical flow module
if use_tiny_spynet:
self.spynet = ModifiedSPyNet(num_blocks=3, use_tiny_block=True)
weight_path = get_path_from_url(
'https://paddlegan.bj.bcebos.com/models/modified_spynet_tiny.pdparams'
)
self.spynet.set_state_dict(paddle.load(weight_path))
else:
self.spynet = ModifiedSPyNet(num_blocks=6, use_tiny_block=False)
weight_path = get_path_from_url(
'https://paddlegan.bj.bcebos.com/models/modified_spynet.pdparams'
)
self.spynet.set_state_dict(paddle.load(weight_path))
# feature extraction module
self.feat_extract = ResidualBlocksWithInputConv(3, mid_channels,
num_init_blocks)
# propagation branches module for stage2 and stage3
self.deform_align = nn.LayerDict()
self.backbone = nn.LayerDict()
prop_names = [
'stage2_backward', 'stage2_forward', 'stage3_backward',
'stage3_forward'
]
for i, layer in enumerate(prop_names):
if i > 1 and self.use_refine_align:
self.deform_align[layer] = ReAlignmentModule(
mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=deform_groups)
else:
self.deform_align[layer] = AlignmentModule(
mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=deform_groups)
self.backbone[layer] = ResidualBlocksWithInputConv(
(3 + i) * mid_channels, mid_channels, num_blocks)
# stage1
self.stage1_align = AlignmentModule(mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=stage1_groups)
self.stage1_blocks = ResidualBlocksWithInputConv(
3 * mid_channels, mid_channels, 3)
# upsampling module
self.reconstruction = ResidualBlocksWithInputConv(
6 * mid_channels, mid_channels, num_reconstruction_blocks)
self.upsample1 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.upsample2 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
if self.only_last:
self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
else:
self.conv_hr = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)
self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
self.img_upsample = nn.Upsample(scale_factor=4,
mode='bilinear',
align_corners=False)
# activation function
self.lrelu = nn.LeakyReLU(negative_slope=0.1)
# auxiliary loss
if self.auxiliary_loss:
self.aux_fusion = nn.Conv2D(mid_channels * 2, mid_channels, 3, 1, 1)
self.aux_reconstruction = ResidualBlocksWithInputConv(
4 * mid_channels, mid_channels, aux_reconstruction_blocks)
self.aux_block_down1 = nn.Sequential(
nn.Conv2D(3 + mid_channels, mid_channels, 3, 2, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
self.aux_block_down2 = nn.Sequential(
nn.Conv2D(mid_channels * 2, mid_channels, 3, 2, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
self.aux_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
self.aux_upsample1 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.aux_upsample2 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.hybrid_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
def check_if_mirror_extended(self, lrs):
"""Check whether the input is a mirror-extended sequence.
If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
(t-1-i)-th frame.
Args:
lrs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
Bool: Whether the input is a mirror-extended sequence.
"""
with paddle.no_grad():
self.is_mirror_extended = False
if lrs.shape[1] % 2 == 0:
lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
lrs_2 = paddle.flip(lrs_2, [1])
if paddle.norm(lrs_1 - lrs_2) == 0:
self.is_mirror_extended = True
def compute_flow(self, lrs):
"""Compute optical flow using pretrained flow network for feature alignment.
Args:
lrs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
Tuple: Tensor of forward optical flow and backward optical flow with shape (n, t-1, 2, h, w).
"""
n, t, c, h, w = lrs.shape
lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
if self.is_mirror_extended:
flows_forward = flows_backward.flip(1)
else:
flows_forward = self.spynet(lrs_2,
lrs_1).reshape([n, t - 1, 2, h, w])
return flows_forward, flows_backward
def stage1(self, feats, flows, flows_forward=None):
"""Stage1 of PP-MSVSR network.
Args:
feats (dict): Dict with key 'spatial', the value is Array of tensor after feature extraction with shape (n, c, h, w).
flows (tensor): Backward optical flow with shape (n, t-1, 2, h, w).
flows_forward (tensor): Forward optical flow with shape (n, t-1, 2, h, w).
Returns:
Dict: The input dict with new keys 'feat_stage1', the value of 'feat_stage1' is Array of tensor after Local Fusion Module with shape (n, c, h, w).
"""
n, t, _, h, w = flows.shape
frame_idx = range(t, -1, -1)
flow_idx = range(t, -1, -1)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
# Local Fusion Module
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
# get aligned right adjacent frames
if i > 0:
feat_prop = feats['spatial'][mapping_idx[idx + 1]]
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop, _, _ = self.stage1_align(feat_prop, cond, flow_n1)
else:
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
# get aligned left adjacent frames
if i < t:
feat_back = feats['spatial'][mapping_idx[idx - 1]]
flow_n1_ = flows_forward[:, flow_idx[i] - 1, :, :, :]
cond_n1_ = flow_warp(feat_back, flow_n1_.transpose([0, 2, 3,
1]))
cond_ = paddle.concat([cond_n1_, feat_current], axis=1)
feat_back, _, _ = self.stage1_align(feat_back, cond_, flow_n1_)
else:
feat_back = paddle.zeros([n, self.mid_channels, h, w])
# concatenate and residual blocks
feat = [feat_current] + [feat_prop] + [feat_back]
feat = paddle.concat(feat, axis=1)
feat = self.stage1_blocks(feat)
feats['feat_stage1'].append(feat)
feats['feat_stage1'] = feats['feat_stage1'][::-1]
return feats
def stage2(self, feats, flows):
"""Stage2 of PP-MSVSR network.
Args:
feats (dict): Dict with key 'spatial' and 'feat_stage1' after stage1.
flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
Returns:
feats (dict): The input dict with new keys 'stage2_backward' and 'stage2_forward', the value of both is Array of feature after stage2 with shape (n, c, h, w).
pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
"""
flows_backward, flows_forward = flows
n, t, _, h, w = flows_backward.shape
pre_offset = {}
pre_mask = {}
# propagation branches module
for prop_name in ['stage2_backward', 'stage2_forward']:
pre_offset[prop_name] = [0 for _ in range(t)]
pre_mask[prop_name] = [0 for _ in range(t)]
feats[prop_name] = []
frame_idx = range(0, t + 1)
flow_idx = range(-1, t)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
if 'backward' in prop_name:
frame_idx = frame_idx[::-1]
flow_idx = frame_idx
flows = flows_backward
else:
flows = flows_forward
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
if i > 0:
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop,
flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop, offset, mask = self.deform_align[prop_name](
feat_prop, cond, flow_n1)
pre_offset[prop_name][flow_idx[i]] = offset
pre_mask[prop_name][flow_idx[i]] = (mask)
# concatenate and residual blocks
feat = [feat_current] + [
feats[k][idx]
for k in feats if k not in ['spatial', prop_name]
] + [feat_prop]
feat = paddle.concat(feat, axis=1)
feat_prop = feat_prop + self.backbone[prop_name](feat)
feats[prop_name].append(feat_prop)
if 'backward' in prop_name:
feats[prop_name] = feats[prop_name][::-1]
return feats, pre_offset, pre_mask
def stage3(self,
feats,
flows,
aux_feats=None,
pre_offset=None,
pre_mask=None):
"""Stage3 of PP-MSVSR network.
Args:
feats (dict): Dict of features after stage2.
flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
aux_feats (dict): Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
Returns:
feats (dict): The input feats dict with new keys 'stage3_backward' and 'stage3_forward', the value of both is Array of feature after stage3 with shape (n, c, h, w).
"""
flows_backward, flows_forward = flows
n, t, _, h, w = flows_backward.shape
# propagation branches module
for prop_name in ['stage3_backward', 'stage3_forward']:
feats[prop_name] = []
frame_idx = range(0, t + 1)
flow_idx = range(-1, t)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
if 'backward' in prop_name:
frame_idx = frame_idx[::-1]
flow_idx = frame_idx
flows = flows_backward
pre_stage_name = 'stage2_backward'
else:
flows = flows_forward
pre_stage_name = 'stage2_forward'
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
if aux_feats is not None and 'feats' in aux_feats:
feat_current = aux_feats['feats'][mapping_idx[idx]]
if i > 0:
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop,
flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop = self.deform_align[prop_name](
feat_prop, cond, flow_n1, feat_current,
pre_offset[pre_stage_name][flow_idx[i]],
pre_mask[pre_stage_name][flow_idx[i]])
# concatenate and residual blocks
feat = [feat_current] + [
feats[k][idx]
for k in feats if k not in ['spatial', prop_name]
] + [feat_prop]
feat = paddle.concat(feat, axis=1)
feat_prop = feat_prop + self.backbone[prop_name](feat)
feats[prop_name].append(feat_prop)
if 'backward' in prop_name:
feats[prop_name] = feats[prop_name][::-1]
return feats
def auxiliary_stage(self, feats, lqs):
"""Compute the output image and auxiliary feature for Auxiliary Loss in stage2.
Args:
feats (dict): Dict of features after stage2.
lqs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
dict: Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
"""
aux_feats = {}
aux_feats['outs'] = []
aux_feats['feats'] = []
num_outputs = len(feats['spatial'])
mapping_idx = list(range(0, num_outputs))
mapping_idx += mapping_idx[::-1]
for i in range(0, lqs.shape[1]):
hr = [feats[k][i] for k in feats if (k != 'spatial')]
feat_current = feats['spatial'][mapping_idx[i]]
hr.insert(0, feat_current)
hr = paddle.concat(hr, axis=1)
hr_low = self.aux_reconstruction(hr)
hr_mid = self.lrelu(self.aux_upsample1(hr_low))
hr_high = self.lrelu(self.aux_upsample2(hr_mid))
hr = self.aux_conv_last(hr_high)
hr += self.img_upsample(lqs[:, i, :, :, :])
# output tensor of auxiliary_stage with shape (n, 3, 4*h, 4*w)
aux_feats['outs'].append(hr)
aux_feat = self.aux_block_down1(paddle.concat([hr, hr_high],
axis=1))
aux_feat = self.aux_block_down2(
paddle.concat([aux_feat, hr_mid], axis=1))
aux_feat = self.aux_fusion(paddle.concat([aux_feat, hr_low],
axis=1))
# out feature of auxiliary_stage with shape (n, c, h, w)
aux_feats['feats'].append(aux_feat)
return aux_feats
def upsample(self, lqs, feats, aux_feats=None):
"""Compute the output image given the features.
Args:
lqs (tensor): Input LR images with shape (n, t, c, h, w).
feats (dict): Dict of features after stage3.
aux_feats (dict): Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
Returns:
Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
"""
outputs = []
outputs_head = []
num_outputs = len(feats['spatial'])
mapping_idx = list(range(0, num_outputs))
mapping_idx += mapping_idx[::-1]
cas_outs = []
pas = []
hrs = []
for i in range(0, lqs.shape[1]):
hr = [
feats[k].pop(0) for k in feats
if (k != 'spatial' and k != 'feat_stage1')
]
if 'feat_stage1' in feats:
local_feat = feats['feat_stage1'].pop(0)
hr.insert(0, local_feat)
hr.insert(0, feats['spatial'][mapping_idx[i]])
hr = paddle.concat(hr, axis=1)
hr = self.reconstruction(hr)
hr = self.lrelu(self.upsample1(hr))
hr = self.lrelu(self.upsample2(hr))
if self.only_last:
hr = self.conv_last(hr)
else:
hr = self.lrelu(self.conv_hr(hr))
hr = self.conv_last(hr)
hr += self.img_upsample(lqs[:, i, :, :, :])
if self.use_local_connnect:
local_head = self.lrelu(self.aux_upsample1(local_feat))
local_head = self.lrelu(self.aux_upsample2(local_head))
hr = self.hybrid_conv_last(local_head) + hr
outputs.append(hr)
if self.auxiliary_loss:
return paddle.stack(aux_feats['outs'],
axis=1), paddle.stack(outputs, axis=1)
return paddle.stack(outputs, axis=1)
def forward(self, lqs):
"""Forward function for PP-MSVSR.
Args:
lqs (Tensor): Input LR sequence with shape (n, t, c, h, w).
Returns:
Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
"""
n, t, c, h, w = lqs.shape
lqs_downsample = lqs
# check whether the input is an extended sequence
self.check_if_mirror_extended(lqs)
feats = {}
feats_ = self.feat_extract(lqs.reshape([-1, c, h, w]))
h, w = feats_.shape[2:]
feats_ = feats_.reshape([n, t, -1, h, w])
feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)]
# compute optical flow using the low-res inputs
assert lqs_downsample.shape[3] >= 64 and lqs_downsample.shape[4] >= 64, (
'The height and width of low-res inputs must be at least 64, '
f'but got {h} and {w}.')
flows_forward, flows_backward = self.compute_flow(lqs_downsample)
# feature propagation
feats['feat_stage1'] = []
feats = self.stage1(feats, flows_backward, flows_forward)
feats, pre_offset, pre_mask = self.stage2(
feats, (flows_backward, flows_forward))
if self.auxiliary_loss:
aux_feats = self.auxiliary_stage(feats, lqs)
feats = self.stage3(feats, (flows_backward, flows_forward), aux_feats,
pre_offset, pre_mask)
return self.upsample(lqs, feats, aux_feats=aux_feats)
class AlignmentModule(nn.Layer):
"""deformable alignment module.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
deformable_groups (int): Number of deformable_groups in DeformConv2D.
"""
def __init__(self,
in_channels=128,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
deformable_groups=16):
super(AlignmentModule, self).__init__()
self.conv_offset = nn.Sequential(
nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
)
self.dcn = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.init_offset()
def init_offset(self):
constant_(self.conv_offset[-1].weight, 0)
constant_(self.conv_offset[-1].bias, 0)
def forward(self, x, extra_feat, flow_1):
extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
out = self.conv_offset(extra_feat)
o1, o2, mask = paddle.chunk(out, 3, axis=1)
# offset: residual offsets bounded to [-10, 10] by tanh, added to the
# optical flow (channels flipped, tiled per sampling location) as the base
offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
offset = offset + flow_1.flip(1).tile([1, offset.shape[1] // 2, 1, 1])
# mask
mask = F.sigmoid(mask)
out = self.dcn(x, offset, mask)
return out, offset, mask
class ReAlignmentModule(nn.Layer):
"""refine deformable alignment module.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
deformable_groups (int): Number of deformable_groups in DeformConv2D.
"""
def __init__(self,
in_channels=128,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
deformable_groups=16):
super(ReAlignmentModule, self).__init__()
self.mdconv = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.conv_offset = nn.Sequential(
nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
)
self.dcn = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.init_offset()
def init_offset(self):
constant_(self.conv_offset[-1].weight, 0)
constant_(self.conv_offset[-1].bias, 0)
def forward(self,
x,
extra_feat,
flow_1,
feat_current,
pre_stage_flow=None,
pre_stage_mask=None):
if pre_stage_flow is not None:
pre_feat = self.mdconv(x, pre_stage_flow, pre_stage_mask)
extra_feat = paddle.concat([pre_feat, feat_current, flow_1], axis=1)
else:
extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
out = self.conv_offset(extra_feat)
o1, o2, mask = paddle.chunk(out, 3, axis=1)
# offset: residual offsets bounded to [-10, 10] by tanh; use the stage2
# offset as the base when available, otherwise fall back to the flipped,
# tiled optical flow
offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
if pre_stage_flow is not None:
offset = offset + pre_stage_flow
else:
offset = offset + flow_1.flip(1).tile(
[1, offset.shape[1] // 2, 1, 1])
# mask
if pre_stage_mask is not None:
mask = (F.sigmoid(mask) + pre_stage_mask) / 2.0
else:
mask = F.sigmoid(mask)
out = self.dcn(x, offset, mask)
return out
class ModifiedSPyNet(nn.Layer):
"""Modified SPyNet network structure.
The differences from the SPyNet in the paper are:
1. convolutions with kernel_size=7 are replaced by convolutions with kernel_size=3 in this version,
2. fewer SPyNetBasicModules are used in this version,
3. no BN is used in this version.
Paper:
Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
Args:
act_cfg (dict): Activation function.
Default: dict(name='LeakyReLU').
num_blocks (int): Number of SPyNetBlock.
Default: 6.
use_tiny_block (bool): Whether to use tiny blocks.
Default: False.
"""
def __init__(self,
act_cfg=dict(name='LeakyReLU'),
num_blocks=6,
use_tiny_block=False):
super().__init__()
self.num_blocks = num_blocks
self.basic_module = nn.LayerList([
SPyNetBlock(act_cfg=act_cfg, use_tiny_block=use_tiny_block)
for _ in range(num_blocks)
])
self.register_buffer(
'mean',
paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
self.register_buffer(
'std',
paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
def compute_flow(self, ref, supp):
"""Compute flow from ref to supp.
Note that in this function, the images are already resized to a
multiple of 32.
Args:
ref (Tensor): Reference image with shape of (n, 3, h, w).
supp (Tensor): Supporting image with shape of (n, 3, h, w).
Returns:
Tensor: Estimated optical flow: (n, 2, h, w).
"""
n, _, h, w = ref.shape
# normalize the input images
ref = [(ref - self.mean) / self.std]
supp = [(supp - self.mean) / self.std]
# generate downsampled frames
for level in range(self.num_blocks - 1):
ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
ref = ref[::-1]
supp = supp[::-1]
# flow computation
flow = paddle.to_tensor(
np.zeros([
n, 2, h // (2**(self.num_blocks - 1)), w //
(2**(self.num_blocks - 1))
], 'float32'))
for level in range(len(ref)):
if level == 0:
flow_up = flow
else:
flow_up = F.interpolate(
flow, scale_factor=2, mode='bilinear',
align_corners=True) * 2.0
# add the residue to the upsampled flow
flow = flow_up + self.basic_module[level](paddle.concat([
ref[level],
flow_warp(supp[level],
flow_up.transpose([0, 2, 3, 1]),
padding_mode='border'), flow_up
],
axis=1))
return flow
def compute_flow_list(self, ref, supp):
n, _, h, w = ref.shape
# normalize the input images
ref = [(ref - self.mean) / self.std]
supp = [(supp - self.mean) / self.std]
# generate downsampled frames
for level in range(self.num_blocks - 1):
ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
ref = ref[::-1]
supp = supp[::-1]
# flow computation
flow_list = []
flow = paddle.to_tensor(
np.zeros([
n, 2, h // (2**(self.num_blocks - 1)), w //
(2**(self.num_blocks - 1))
], 'float32'))
for level in range(len(ref)):
if level == 0:
flow_up = flow
else:
flow_up = F.interpolate(
flow, scale_factor=2, mode='bilinear',
align_corners=True) * 2.0
# add the residue to the upsampled flow
flow = flow_up + self.basic_module[level](paddle.concat([
ref[level],
flow_warp(supp[level],
flow_up.transpose([0, 2, 3, 1]),
padding_mode='border'), flow_up
],
axis=1))
flow_list.append(flow)
return flow_list
def forward(self, ref, supp):
"""Forward function of Modified SPyNet.
This function computes the optical flow from ref to supp.
Args:
ref (Tensor): Reference image with shape of (n, 3, h, w).
supp (Tensor): Supporting image with shape of (n, 3, h, w).
Returns:
Tensor: Estimated optical flow: (n, 2, h, w).
"""
# upsize to a multiple of 32
h, w = ref.shape[2:4]
w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1)
h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1)
ref = F.interpolate(ref,
size=(h_up, w_up),
mode='bilinear',
align_corners=False)
supp = F.interpolate(supp,
size=(h_up, w_up),
mode='bilinear',
align_corners=False)
ref.stop_gradient = False
supp.stop_gradient = False
# compute flow, and resize back to the original resolution
flow = F.interpolate(self.compute_flow(ref, supp),
size=(h, w),
mode='bilinear',
align_corners=False)
# adjust the flow values
flow[:, 0, :, :] *= float(w) / float(w_up)
flow[:, 1, :, :] *= float(h) / float(h_up)
return flow
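# Illustrative usage of ModifiedSPyNet (a sketch added for documentation, not
# part of the original file). Arbitrary input sizes work because forward()
# resizes the frames to a multiple of 32 and rescales the flow back:
#
#     spynet = ModifiedSPyNet(num_blocks=6)  # randomly initialized here;
#     ref = paddle.rand([1, 3, 100, 170])    # MSVSR loads pretrained weights
#     supp = paddle.rand([1, 3, 100, 170])
#     flow = spynet(ref, supp)               # shape: [1, 2, 100, 170]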
class SPyNetBlock(nn.Layer):
"""Basic Block of Modified SPyNet.
Refer to Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017.
"""
def __init__(self, act_cfg=dict(name='LeakyReLU'), use_tiny_block=False):
super().__init__()
if use_tiny_block:
self.basic_module = nn.Sequential(
ConvLayer(in_channels=8,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=8,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=8,
out_channels=8,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=8,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
act_cfg=None))
else:
self.basic_module = nn.Sequential(
ConvLayer(in_channels=8,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=64,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
act_cfg=None))
def forward(self, tensor_input):
"""Forward function of SPyNetBlock.
Args:
tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
8 channels contain:
[reference image (3), neighbor image (3), initial flow (2)].
Returns:
Tensor: Refined flow with shape (b, 2, h, w)
"""
return self.basic_module(tensor_input)
class ConvLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
act_cfg=dict(name='ReLU')):
super(ConvLayer, self).__init__()
self.act_cfg = act_cfg
self.with_activation = act_cfg is not None
self.conv = nn.Conv2D(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups)
if self.with_activation:
if act_cfg['name'] == 'ReLU':
self.act = paddle.nn.ReLU()
elif act_cfg['name'] == 'LeakyReLU':
self.act = nn.LeakyReLU(negative_slope=0.1)
def forward(self, tensor_input):
out = self.conv(tensor_input)
if self.with_activation:
out = self.act(out)
return out
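# ---------------------------------------------------------------------------
# Illustrative smoke test (a minimal sketch added for documentation; not part
# of the original file). Running this module directly performs one forward
# pass of the default tiny MSVSR generator on random frames. Constructing
# MSVSR downloads the pretrained ModifiedSPyNet weights, so network access is
# assumed.
if __name__ == '__main__':
    # (n, t, c, h, w); height and width must be at least 64
    lqs = paddle.rand([1, 5, 3, 64, 64])
    model = MSVSR()  # defaults: mid_channels=32, use_tiny_spynet=True
    model.eval()
    with paddle.no_grad():
        # auxiliary_loss=True, so forward returns (stage2 output, final output)
        aux_out, out = model(lqs)
    print(out.shape)  # [1, 5, 3, 256, 256] for 4x upsampling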
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from .builder import MODELS
from .sr_model import BaseSRModel
from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
from .generators.msvsr import ModifiedSPyNet
from ..modules.init import reset_parameters
from ..utils.visual import tensor2img
@MODELS.register()
class MultiStageVSRModel(BaseSRModel):
"""PP-MSVSR Model.
Paper:
PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
"""
def __init__(self, generator, fix_iter, pixel_criterion=None):
"""Initialize the PP-MSVSR class.
Args:
generator (dict): config of generator.
fix_iter (int): number of iterations during which the flow network (spynet) is kept fixed.
pixel_criterion (dict): config of pixel criterion.
"""
super(MultiStageVSRModel, self).__init__(generator, pixel_criterion)
self.fix_iter = fix_iter
self.current_iter = 1
self.flag = True
init_basicvsr_weight(self.nets['generator'])
if not self.fix_iter:
print('init train all parameters!!!')
for name, param in self.nets['generator'].named_parameters():
param.trainable = True
if 'spynet' in name:
param.optimize_attr['learning_rate'] = 0.25
def setup_input(self, input):
self.lq = paddle.to_tensor(input['lq'])
self.visual_items['lq'] = self.lq[:, 0, :, :, :]
if 'gt' in input:
self.gt = paddle.to_tensor(input['gt'])
self.visual_items['gt'] = self.gt[:, 0, :, :, :]
self.image_paths = input['lq_path']
def train_iter(self, optims=None):
optims['optim'].clear_grad()
if self.fix_iter:
if self.current_iter == 1:
print('Train MSVSR with fixed spynet for', self.fix_iter,
'iters.')
for name, param in self.nets['generator'].named_parameters():
if 'spynet' in name:
param.trainable = False
elif self.current_iter >= self.fix_iter + 1 and self.flag:
print('Train all the parameters.')
for name, param in self.nets['generator'].named_parameters():
param.trainable = True
if 'spynet' in name:
param.optimize_attr['learning_rate'] = 0.25
self.flag = False
for net in self.nets.values():
net.find_unused_parameters = False
output = self.nets['generator'](self.lq)
if isinstance(output, (list, tuple)):
out_stage2, output = output
loss_pix_stage2 = self.pixel_criterion(out_stage2, self.gt)
self.losses['loss_pix_stage2'] = loss_pix_stage2
self.visual_items['output'] = output[:, 0, :, :, :]
# pixel loss
loss_pix = self.pixel_criterion(output, self.gt)
self.losses['loss_pix'] = loss_pix
self.loss = sum(_value for _key, _value in self.losses.items()
if 'loss_pix' in _key)
self.losses['loss'] = self.loss
self.loss.backward()
optims['optim'].step()
self.current_iter += 1
def test_iter(self, metrics=None):
self.gt = self.gt.cpu()
self.nets['generator'].eval()
with paddle.no_grad():
output = self.nets['generator'](self.lq)
if isinstance(output, (list, tuple)):
out_stage1, output = output
self.nets['generator'].train()
out_img = []
gt_img = []
_, t, _, _, _ = self.gt.shape
for i in range(t):
out_tensor = output[0, i]
gt_tensor = self.gt[0, i]
out_img.append(tensor2img(out_tensor, (0., 1.)))
gt_img.append(tensor2img(gt_tensor, (0., 1.)))
if metrics is not None:
for metric in metrics.values():
metric.update(out_img, gt_img, is_seq=True)
def init_basicvsr_weight(net):
for m in net.children():
if hasattr(m,
'weight') and not isinstance(m,
(nn.BatchNorm, nn.BatchNorm2D)):
reset_parameters(m)
continue
if (not isinstance(
m,
(ResidualBlockNoBN, PixelShufflePack, SPyNet, ModifiedSPyNet))):
init_basicvsr_weight(m)