Unverified commit 8aafe376 authored by: W wangna11BD committed by: GitHub

Add msvsr (#496)

* fix benchmark dataset

* fix edvr and basic bug

* add PP-MSVSR

* add experiment results in docs

* fix ssim

* modif

* modif2
Parent 6a31877f
......@@ -39,7 +39,7 @@ run_cmd="set -xe;
nvidia-docker run --name test_paddlegan -i \
--net=host \
--shm-size=1g \
--shm-size=128g \
-v $PWD:/workspace \
${ImageName} /bin/bash -c "${run_cmd}"
```
......
......@@ -59,7 +59,7 @@ dataset:
test:
name: VSRFolderDataset
# for udm10 dataset
# for UDM10 dataset
# lq_folder: data/udm10/BDx4
# gt_folder: data/udm10/GT
lq_folder: data/Vid4/BDx4
......@@ -67,7 +67,7 @@ dataset:
preprocess:
- name: GetNeighboringFramesIdx
interval_list: [1]
# for udm10 dataset
# for UDM10 dataset
# filename_tmpl: '{:04d}.png'
filename_tmpl: '{:08d}.png'
- name: ReadImageSequence
......
......@@ -23,8 +23,8 @@ dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4 # 6
batch_size: 2 # 4*2
num_workers: 4
batch_size: 2 #4 GPUs
dataset:
name: SRREDSMultipleGTDataset
mode: train
......
......@@ -43,7 +43,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -42,7 +42,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -46,7 +46,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -42,7 +42,7 @@ dataset:
scale: 4
fix_random_seed: 10
num_workers: 3
batch_size: 4 # 8GUPs
batch_size: 4 # 8GPUs
test:
......
......@@ -23,8 +23,8 @@ dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4 # 6
batch_size: 2 # 4*2
num_workers: 4
batch_size: 2 #4 GPUs
dataset:
name: SRREDSMultipleGTDataset
mode: train
......
......@@ -32,7 +32,7 @@ dataset:
load_size: 136
crop_size: 128
num_workers: 16
batch_size: 5
batch_size: 5 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
......@@ -38,7 +38,7 @@ dataset:
load_size: 280
crop_size: 256
num_workers: 16
batch_size: 5
batch_size: 5 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
......@@ -38,7 +38,7 @@ dataset:
load_size: 540
crop_size: 512
num_workers: 16
batch_size: 2
batch_size: 2 #1 GPUs
test:
name: LapStyleDataset
content_root: data/coco/test2017/
......
total_iters: 300000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: 2500
generator:
name: MSVSR
mid_channels: 64
num_init_blocks: 5
num_blocks: 7
num_reconstruction_blocks: 5
only_last: False
use_tiny_spynet: False
deform_groups: 8
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 2
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4
batch_size: 2 #8 gpus
use_shared_memory: True
dataset:
name: SRREDSMultipleGTDataset
mode: train
lq_folder: data/REDS/train_sharp_bicubic/X4
gt_folder: data/REDS/train_sharp/X4
crop_size: 256
interval_list: [1]
random_reverse: False
number_frames: 30
use_flip: True
use_rot: True
scale: 4
val_partition: REDS4
test:
name: SRREDSMultipleGTDataset
mode: test
lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
gt_folder: data/REDS/REDS4_test_sharp/X4
interval_list: [1]
random_reverse: False
number_frames: 100
use_flip: False
use_rot: False
scale: 4
val_partition: REDS4
num_workers: 0
batch_size: 1
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [300000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 5000
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: false
ssim:
name: SSIM
crop_border: 0
test_y_channel: false
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 5000
total_iters: 150000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: 2500
generator:
name: MSVSR
mid_channels: 32
num_init_blocks: 2
num_blocks: 3
num_reconstruction_blocks: 2
only_last: True
use_tiny_spynet: True
deform_groups: 4
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 1
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 6
batch_size: 2 #8 gpus
use_shared_memory: True
dataset:
name: SRREDSMultipleGTDataset
mode: train
lq_folder: data/REDS/train_sharp_bicubic/X4
gt_folder: data/REDS/train_sharp/X4
crop_size: 256
interval_list: [1]
random_reverse: False
number_frames: 20
use_flip: True
use_rot: True
scale: 4
val_partition: REDS4
test:
name: SRREDSMultipleGTDataset
mode: test
lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
gt_folder: data/REDS/REDS4_test_sharp/X4
interval_list: [1]
random_reverse: False
number_frames: 100
use_flip: False
use_rot: False
scale: 4
val_partition: REDS4
num_workers: 0
batch_size: 1
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [150000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 5000
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: false
ssim:
name: SSIM
crop_border: 0
test_y_channel: false
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 5000
total_iters: 300000
output_dir: output_dir
find_unused_parameters: True
checkpoints_dir: checkpoints
use_dataset: True
# tensor range for function tensor2img
min_max:
(0., 1.)
model:
name: MultiStageVSRModel
fix_iter: -1
generator:
name: MSVSR
mid_channels: 32
num_init_blocks: 2
num_blocks: 3
num_reconstruction_blocks: 2
only_last: True
use_tiny_spynet: True
deform_groups: 4
stage1_groups: 8
auxiliary_loss: True
use_refine_align: True
aux_reconstruction_blocks: 1
use_local_connnect: True
pixel_criterion:
name: CharbonnierLoss
reduction: mean
dataset:
train:
name: RepeatDataset
times: 1000
num_workers: 4
batch_size: 2 #8 gpus
dataset:
name: VSRVimeo90KDataset
# mode: train
lq_folder: data/vimeo90k/vimeo_septuplet_BD_matlabLRx4/sequences
gt_folder: data/vimeo90k/vimeo_septuplet/sequences
ann_file: data/vimeo90k/vimeo_septuplet/sep_trainlist.txt
preprocess:
- name: ReadImageSequence
key: lq
- name: ReadImageSequence
key: gt
- name: Transforms
input_keys: [lq, gt]
pipeline:
- name: SRPairedRandomCrop
gt_patch_size: 256
scale: 4
keys: [image, image]
- name: PairedRandomHorizontalFlip
keys: [image, image]
- name: PairedRandomVerticalFlip
keys: [image, image]
- name: PairedRandomTransposeHW
keys: [image, image]
- name: TransposeSequence
keys: [image, image]
- name: MirrorVideoSequence
- name: NormalizeSequence
mean: [0., .0, 0.]
std: [255., 255., 255.]
keys: [image, image]
test:
name: VSRFolderDataset
# for udm10 dataset
# lq_folder: data/udm10/BDx4
# gt_folder: data/udm10/GT
lq_folder: data/Vid4/BDx4
gt_folder: data/Vid4/GT
preprocess:
- name: GetNeighboringFramesIdx
interval_list: [1]
# for udm10 dataset
# filename_tmpl: '{:04d}.png'
filename_tmpl: '{:08d}.png'
- name: ReadImageSequence
key: lq
- name: ReadImageSequence
key: gt
- name: Transforms
input_keys: [lq, gt]
pipeline:
- name: TransposeSequence
keys: [image, image]
- name: NormalizeSequence
mean: [0., .0, 0.]
std: [255., 255., 255.]
keys: [image, image]
lr_scheduler:
name: CosineAnnealingRestartLR
learning_rate: !!float 2e-4
periods: [300000]
restart_weights: [1]
eta_min: !!float 1e-7
optimizer:
name: Adam
# add parameters of net_name to optim
# name should be in self.nets
net_names:
- generator
beta1: 0.9
beta2: 0.99
validate:
interval: 2500
save_img: false
metrics:
psnr: # metric name, can be arbitrary
name: PSNR
crop_border: 0
test_y_channel: true
ssim:
name: SSIM
crop_border: 0
test_y_channel: true
log_config:
interval: 10
visiual_interval: 5000
snapshot_config:
interval: 2500
......@@ -3,15 +3,22 @@
## 1.1 Principle
Video super-resolution originates from image super-resolution, which aims to recover high-resolution (HR) images from one or more low resolution (LR) images. The difference between them is that the video is composed of multiple frames, so the video super-resolution usually uses the information between frames to repair. Here we provide the video super-resolution model [EDVR](https://arxiv.org/pdf/1905.02716.pdf).[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf).
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is that a video is composed of multiple frames, so video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.
[EDVR](https://arxiv.org/pdf/1905.02716.pdf) won the championship and outperformed the second place by a large margin in all four tracks of the NTIRE19 video restoration and enhancement challenges. The main difficulties of video super-resolution come from two aspects: (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR devises a Pyramid, Cascading and Deformable (PCD) alignment module, in which frame alignment is done at the feature level using deformable convolutions in a coarse-to-fine manner. Second, EDVR proposes a Temporal and Spatial Attention (TSA) fusion module, in which attention is applied both temporally and spatially, so as to emphasize important features for subsequent restoration.
[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components for VSR guided by four basic functionalities, i.e., propagation, alignment, aggregation, and upsampling. By reusing some existing components with minimal redesigns, the resulting succinct pipeline, BasicVSR, achieves appealing improvements in speed and restoration quality in comparison to many state-of-the-art algorithms. By presenting an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be expanded to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By empowering the recurrent framework with enhanced propagation and alignment, BasicVSR++ can exploit spatiotemporal information across misaligned video frames more effectively. The new components lead to improved performance under a similar computational constraint. In particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. In NTIRE 2021, BasicVSR++ won three championships and one runner-up in the Video Super-Resolution and Compressed Video Enhancement Challenges.
PP-MSVSR is a multi-stage VSR deep architecture with a local fusion module, an auxiliary loss, and a refined alignment module to refine the enhanced result progressively. Specifically, to strengthen the fusion of features across frames during feature propagation, a local fusion module is designed in stage-1 to perform local feature fusion before propagation. Moreover, an auxiliary loss is introduced in stage-2 to make the features obtained by the propagation module retain more information correlated with the HR space, and a refined alignment module is introduced in stage-3 to make full use of the feature information from the previous stage. Extensive experiments substantiate that PP-MSVSR achieves promising performance on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
## 1.2 How to use
### 1.2.1 Prepare Datasets
Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4, and UDM10. REDS and Vimeo90K include training and test sets, while Vid4 and UDM10 are test datasets. Download and decompress the required datasets and place them under ``PaddleGAN/data``.
REDS ([download](https://seungjunnah.github.io/Datasets/reds.html)) is a newly proposed high-quality (720p) video dataset from the NTIRE19 competition. REDS consists of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test ground truth is not available, we select four representative clips (namely '000', '011', '015', '020', with diverse scenes and motions) as our test set, denoted by REDS4. The remaining training and validation clips are re-grouped as our training dataset (a total of 266 clips).
......@@ -31,6 +38,49 @@
...
```
Vimeo90K ([download](http://toflow.csail.mit.edu/)) is designed by Tianfan Xue et al. for the following four video processing tasks: temporal frame interpolation, video denoising, video deblocking, and video super-resolution. Vimeo90K is a large-scale, high-quality video dataset. This dataset consists of 89,800 video clips downloaded from vimeo.com, which covers a large variety of scenes and actions.
The structure of the processed Vimeo90K is as follows:
```
PaddleGAN
├── data
├── Vimeo90K
├── vimeo_septuplet
| |──sequences
| └──sep_trainlist.txt
├── vimeo_septuplet_BD_matlabLRx4
| └──sequences
└── vimeo_super_resolution_test
|──low_resolution
|──target
└──sep_testlist.txt
...
```
Vid4 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for VSR, which contains 4 video segments.
The structure of the processed Vid4 is as follows:
```
PaddleGAN
├── data
├── Vid4
├── BDx4
└── GT
...
```
UDM10 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for VSR, which contains 10 video segments.
The structure of the processed UDM10 is as follows:
```
PaddleGAN
├── data
├── udm10
├── BDx4
└── GT
...
```
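For the two test-only datasets, the archives linked above can be fetched and unpacked in place. The commands below are one way to do this (a sketch; it assumes the archives extract to the ``Vid4/`` and ``udm10/`` layouts shown in the trees above):
```
cd PaddleGAN
wget https://paddlegan.bj.bcebos.com/datasets/Vid4.zip -P data/
unzip data/Vid4.zip -d data/
wget https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar -P data/
tar -xf data/udm10_paddle.tar -C data/
```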
### 1.2.2 Train/Test
According to the number of intermediate channels, EDVR is divided into EDVR_L (128 channels) and EDVR_M (64 channels). Below, model training and testing are introduced taking EDVR_M as an example.
......@@ -63,24 +113,37 @@
python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```
To train or test other VSR models, you can find the config file of the corresponding model under ``PaddleGAN/configs``, then replace the config file in the command with that of the desired VSR model.
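For example, to evaluate PP-MSVSR the same pattern applies (a sketch; it assumes the PP-MSVSR config added in this commit is named ``configs/msvsr_reds.yaml``):
```
python tools/main.py --config-file configs/msvsr_reds.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```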
## 1.3 Results
The experimental results are evaluated on RGB channels.
The metrics are PSNR / SSIM.
VSR quantitative comparison on REDS4, the test set of the REDS dataset
| Method | Parameters (M) | FLOPs (G) | REDS4 |
|---|---|---|---|
| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
Deblurring quantitative comparison on REDS4, the test set of the REDS dataset
| Method | REDS4 |
|---|---|
| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 |
| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
| BasicVSR_x4 | 31.4325 / 0.8913 |
| IconVSR_x4 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 32.4018 / 0.9071 |
VSR quantitative comparison on Vimeo90K, Vid4, and UDM10
| Model | Vimeo90K | Vid4 | UDM10 |
|---|---|---|---|
| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
## 1.4 Model Download
| Method | Dataset | Download Link |
......@@ -94,7 +157,9 @@ The metrics are PSNR / SSIM.
| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
......@@ -133,3 +198,10 @@ The metrics are PSNR / SSIM.
year = {2021}
}
```
- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]()
```
@article{
}
```
......@@ -3,16 +3,22 @@
## 1.1 Principle
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is clear: since a video is composed of multiple frames, video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), and [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf).
Video super-resolution originates from image super-resolution and aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is clear: since a video is composed of multiple frames, video super-resolution usually uses inter-frame information for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.
[EDVR](https://arxiv.org/pdf/1905.02716.pdf) won the championship in all four tracks of the NTIRE19 video restoration and enhancement challenges, surpassing the second place by a large margin. The main difficulties of video super-resolution are (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR designs a Pyramid, Cascading and Deformable (PCD) alignment module, in which coarse-to-fine deformable convolutions are used for frame alignment at the feature level. Second, EDVR uses a Temporal and Spatial Attention (TSA) fusion module, which applies attention both temporally and spatially to emphasize important features for subsequent restoration.
[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components of the four basic VSR modules (i.e., propagation, alignment, aggregation, and upsampling). By adding some small designs and reusing some existing components, the succinct BasicVSR is obtained. Compared with many state-of-the-art algorithms, BasicVSR achieves appealing improvements in speed and restoration quality. Meanwhile, by adding an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be extended to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By enhancing the recurrent framework with stronger propagation and alignment, BasicVSR++ can exploit the spatiotemporal information of misaligned video frames more effectively. Under similar computational constraints, the new components improve performance: in particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. BasicVSR++ won three championships and one runner-up in the NTIRE 2021 Video Super-Resolution and Compressed Video Enhancement Challenges.
PP-MSVSR is a multi-stage video super-resolution deep architecture with a local fusion module, an auxiliary loss, and a refined alignment module to progressively refine the enhanced result. Specifically, a local fusion module is designed in stage-1 to perform local feature fusion before feature propagation, strengthening the fusion of cross-frame features during propagation. An auxiliary loss is introduced in stage-2 so that the features obtained by the propagation module retain more information related to the HR space, and a refined alignment module is introduced in stage-3 to make full use of the feature information from the propagation module of the previous stage. Extensive experiments confirm that PP-MSVSR performs remarkably on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
## 1.2 How to use
### 1.2.1 Prepare Datasets
Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4, and UDM10. REDS and Vimeo90K include training and test sets, while Vid4 and UDM10 are test datasets. Download and decompress the required datasets and place them under ``PaddleGAN/data``.
REDS ([download](https://seungjunnah.github.io/Datasets/reds.html)) is a newly proposed high-quality (720p) video dataset from the NTIRE19 competition, consisting of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test set is not available, four representative clips (namely '000', '011', '015', '020', with diverse scenes and motions) are selected from the training set as the test set, denoted by REDS4. The remaining training and validation clips are re-grouped as the training dataset (a total of 266 clips).
The structure of the processed REDS dataset is as follows:
......@@ -31,6 +37,49 @@
...
```
Vimeo90K ([download](http://toflow.csail.mit.edu/)) is a dataset built by Tianfan Xue et al. for four video processing tasks: video super-resolution, video denoising, video deblocking, and video frame interpolation. Vimeo90K is a large-scale, high-quality video dataset containing 89,800 video clips downloaded from vimeo.com, covering a large variety of scenes and actions.
The structure of the processed Vimeo90K dataset is as follows:
```
PaddleGAN
├── data
├── Vimeo90K
├── vimeo_septuplet
| |──sequences
| └──sep_trainlist.txt
├── vimeo_septuplet_BD_matlabLRx4
| └──sequences
└── vimeo_super_resolution_test
|──low_resolution
|──target
└──sep_testlist.txt
...
```
Vid4 ([download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for video super-resolution, containing 4 video clips.
The structure of the processed Vid4 dataset is as follows:
```
PaddleGAN
├── data
├── Vid4
├── BDx4
└── GT
...
```
UDM10 ([download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for video super-resolution, containing 10 video clips.
The structure of the processed UDM10 dataset is as follows:
```
PaddleGAN
├── data
├── udm10
├── BDx4
└── GT
...
```
### 1.2.2 Train/Test
According to the number of intermediate channels, EDVR is divided into EDVR_L (128 channels) and EDVR_M (64 channels). Below, model training and testing are introduced taking EDVR_M as an example.
......@@ -59,23 +108,37 @@
python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
```
To train or test other video super-resolution models, you can find the config file of the corresponding model under ``PaddleGAN/configs``, then replace the config file in the command with that of the corresponding model.
## 1.3 Results
The experimental results are evaluated on RGB channels.
The metrics are PSNR / SSIM.
VSR quantitative comparison on REDS4, the test set of the REDS dataset
| Model | Parameters (M) | FLOPs (G) | REDS4 |
|---|---|---|---|
| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
Deblurring quantitative comparison on REDS4, the test set of the REDS dataset
| Model | REDS4 |
|---|---|
| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 |
| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 |
| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 |
| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 |
| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
| BasicVSR_x4 | 31.4325 / 0.8913 |
| IconVSR_x4 | 31.6882 / 0.8950 |
| BasicVSR++_x4 | 32.4018 / 0.9071 |
VSR quantitative comparison on Vimeo90K, Vid4, and UDM10
| Model | Vimeo90K | Vid4 | UDM10 |
|---|---|---|---|
| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
## 1.4 Model Download
| Model | Dataset | Download Link |
......@@ -89,8 +152,9 @@
| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
# References
......@@ -125,3 +189,10 @@
year = {2021}
}
```
- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]()
```
@article{
}
```
......@@ -212,8 +212,8 @@ def calculate_ssim(img1,
f'Wrong input_order {input_order}. Supported input_orders are '
'"HWC" and "CHW"')
img1 = img1.copy().astype('float32')[..., ::-1]
img2 = img2.copy().astype('float32')[..., ::-1]
img1 = img1.copy().astype('float32')
img2 = img2.copy().astype('float32')
img1 = reorder_image(img1, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
......
......@@ -33,3 +33,4 @@ from .lapstyle_model import LapStyleDraModel, LapStyleRevFirstModel, LapStyleRev
from .basicvsr_model import BasicVSRModel
from .mpr_model import MPRModel
from .photopen_model import PhotoPenModel
from .msvsr_model import MultiStageVSRModel
......@@ -37,3 +37,4 @@ from .gpen import GPEN
from .pan import PAN
from .generater_photopen import SPADEGenerator
from .basicvsr_plus_plus import BasicVSRPlusPlus
from .msvsr import MSVSR
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.vision.ops import DeformConv2D
from .basicvsr import PixelShufflePack, flow_warp, SPyNet, ResidualBlocksWithInputConv
from ...utils.download import get_path_from_url
from ...modules.init import kaiming_normal_, constant_
from .builder import GENERATORS
@GENERATORS.register()
class MSVSR(nn.Layer):
"""PP-MSVSR network structure for video super-resolution.
Support only x4 upsampling.
Paper:
PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
Args:
mid_channels (int): Channel number of the intermediate features.
Default: 32.
num_init_blocks (int): Number of residual blocks in feat_extract.
Default: 2.
num_blocks (int): Number of residual blocks in each propagation branch.
Default: 3.
num_reconstruction_blocks (int): Number of residual blocks in reconstruction.
Default: 2.
only_last (bool): Whether the HR feature only goes through the last convolution.
Default: True.
use_tiny_spynet (bool): Whether to use the tiny SPyNet.
Default: True.
deform_groups (int): Number of deformable_groups in DeformConv2D in stage2 and stage3.
Default: 4.
stage1_groups (int): Number of deformable_groups in DeformConv2D in stage1.
Default: 8.
auxiliary_loss (bool): Whether to use the auxiliary loss.
Default: True.
use_refine_align (bool): Whether to use refined alignment.
Default: True.
aux_reconstruction_blocks (int): Number of residual blocks in auxiliary reconstruction.
Default: 1.
use_local_connnect (bool): Whether to add the stage1 feature after upsampling.
Default: True.
"""
def __init__(self,
mid_channels=32,
num_init_blocks=2,
num_blocks=3,
num_reconstruction_blocks=2,
only_last=True,
use_tiny_spynet=True,
deform_groups=4,
stage1_groups=8,
auxiliary_loss=True,
use_refine_align=True,
aux_reconstruction_blocks=1,
use_local_connnect=True):
super().__init__()
self.mid_channels = mid_channels
self.only_last = only_last
self.deform_groups = deform_groups
self.auxiliary_loss = auxiliary_loss
self.use_refine_align = use_refine_align
self.use_local_connnect = use_local_connnect
# optical flow module
if use_tiny_spynet:
self.spynet = ModifiedSPyNet(num_blocks=3, use_tiny_block=True)
weight_path = get_path_from_url(
'https://paddlegan.bj.bcebos.com/models/modified_spynet_tiny.pdparams'
)
self.spynet.set_state_dict(paddle.load(weight_path))
else:
self.spynet = ModifiedSPyNet(num_blocks=6, use_tiny_block=False)
weight_path = get_path_from_url(
'https://paddlegan.bj.bcebos.com/models/modified_spynet.pdparams'
)
self.spynet.set_state_dict(paddle.load(weight_path))
# feature extraction module
self.feat_extract = ResidualBlocksWithInputConv(3, mid_channels,
num_init_blocks)
# propagation branches module for stage2 and stage3
self.deform_align = nn.LayerDict()
self.backbone = nn.LayerDict()
prop_names = [
'stage2_backward', 'stage2_forward', 'stage3_backward',
'stage3_forward'
]
for i, layer in enumerate(prop_names):
if i > 1 and self.use_refine_align:
self.deform_align[layer] = ReAlignmentModule(
mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=deform_groups)
else:
self.deform_align[layer] = AlignmentModule(
mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=deform_groups)
self.backbone[layer] = ResidualBlocksWithInputConv(
(3 + i) * mid_channels, mid_channels, num_blocks)
# stage1
self.stage1_align = AlignmentModule(mid_channels,
mid_channels,
3,
padding=1,
deformable_groups=stage1_groups)
self.stage1_blocks = ResidualBlocksWithInputConv(
3 * mid_channels, mid_channels, 3)
# upsampling module
self.reconstruction = ResidualBlocksWithInputConv(
6 * mid_channels, mid_channels, num_reconstruction_blocks)
self.upsample1 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.upsample2 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
if self.only_last:
self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
else:
self.conv_hr = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)
self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
self.img_upsample = nn.Upsample(scale_factor=4,
mode='bilinear',
align_corners=False)
# activation function
self.lrelu = nn.LeakyReLU(negative_slope=0.1)
# auxiliary loss
if self.auxiliary_loss:
self.aux_fusion = nn.Conv2D(mid_channels * 2, mid_channels, 3, 1, 1)
self.aux_reconstruction = ResidualBlocksWithInputConv(
4 * mid_channels, mid_channels, aux_reconstruction_blocks)
self.aux_block_down1 = nn.Sequential(
nn.Conv2D(3 + mid_channels, mid_channels, 3, 2, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
self.aux_block_down2 = nn.Sequential(
nn.Conv2D(mid_channels * 2, mid_channels, 3, 2, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
self.aux_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
self.aux_upsample1 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.aux_upsample2 = PixelShufflePack(mid_channels,
mid_channels,
2,
upsample_kernel=3)
self.hybrid_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
def check_if_mirror_extended(self, lrs):
"""Check whether the input is a mirror-extended sequence.
If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
(t-1-i)-th frame.
Args:
lrs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
Bool: Whether the input is a mirror-extended sequence.
"""
with paddle.no_grad():
self.is_mirror_extended = False
if lrs.shape[1] % 2 == 0:
lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
lrs_2 = paddle.flip(lrs_2, [1])
if paddle.norm(lrs_1 - lrs_2) == 0:
self.is_mirror_extended = True
def compute_flow(self, lrs):
"""Compute optical flow using pretrained flow network for feature alignment.
Args:
lrs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
Tuple: Tensor of forward optical flow and backward optical flow with shape (n, t-1, 2, h, w).
"""
n, t, c, h, w = lrs.shape
lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
if self.is_mirror_extended:
flows_forward = flows_backward.flip(1)
else:
flows_forward = self.spynet(lrs_2,
lrs_1).reshape([n, t - 1, 2, h, w])
return flows_forward, flows_backward
def stage1(self, feats, flows, flows_forward=None):
"""Stage1 of PP-MSVSR network.
Args:
feats (dict): Dict with key 'spatial', the value is Array of tensor after feature extraction with shape (n, c, h, w).
flows (tensor): Backward optical flow with shape (n, t-1, 2, h, w).
flows_forward (tensor): Forward optical flow with shape (n, t-1, 2, h, w).
Returns:
Dict: The input dict with new keys 'feat_stage1', the value of 'feat_stage1' is Array of tensor after Local Fusion Module with shape (n, c, h, w).
"""
n, t, _, h, w = flows.shape
frame_idx = range(t, -1, -1)
flow_idx = range(t, -1, -1)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
# Local Fusion Module
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
# get aligned right adjacent frames
if i > 0:
feat_prop = feats['spatial'][mapping_idx[idx + 1]]
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop, _, _ = self.stage1_align(feat_prop, cond, flow_n1)
else:
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
# get aligned left adjacent frames
if i < t:
feat_back = feats['spatial'][mapping_idx[idx - 1]]
flow_n1_ = flows_forward[:, flow_idx[i] - 1, :, :, :]
cond_n1_ = flow_warp(feat_back, flow_n1_.transpose([0, 2, 3,
1]))
cond_ = paddle.concat([cond_n1_, feat_current], axis=1)
feat_back, _, _ = self.stage1_align(feat_back, cond_, flow_n1_)
else:
feat_back = paddle.zeros([n, self.mid_channels, h, w])
# concatenate and residual blocks
feat = [feat_current] + [feat_prop] + [feat_back]
feat = paddle.concat(feat, axis=1)
feat = self.stage1_blocks(feat)
feats['feat_stage1'].append(feat)
feats['feat_stage1'] = feats['feat_stage1'][::-1]
return feats
def stage2(self, feats, flows):
"""Stage2 of PP-MSVSR network.
Args:
feats (dict): Dict with key 'spatial' and 'feat_stage1' after stage1.
flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
Returns:
feats (dict): The input dict with new keys 'stage2_backward' and 'stage2_forward', the value of both is Array of feature after stage2 with shape (n, c, h, w).
pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
"""
flows_backward, flows_forward = flows
n, t, _, h, w = flows_backward.shape
pre_offset = {}
pre_mask = {}
# propagation branches module
for prop_name in ['stage2_backward', 'stage2_forward']:
pre_offset[prop_name] = [0 for _ in range(t)]
pre_mask[prop_name] = [0 for _ in range(t)]
feats[prop_name] = []
frame_idx = range(0, t + 1)
flow_idx = range(-1, t)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
if 'backward' in prop_name:
frame_idx = frame_idx[::-1]
flow_idx = frame_idx
flows = flows_backward
else:
flows = flows_forward
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
if i > 0:
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop,
flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop, offset, mask = self.deform_align[prop_name](
feat_prop, cond, flow_n1)
pre_offset[prop_name][flow_idx[i]] = offset
pre_mask[prop_name][flow_idx[i]] = (mask)
# concatenate and residual blocks
feat = [feat_current] + [
feats[k][idx]
for k in feats if k not in ['spatial', prop_name]
] + [feat_prop]
feat = paddle.concat(feat, axis=1)
feat_prop = feat_prop + self.backbone[prop_name](feat)
feats[prop_name].append(feat_prop)
if 'backward' in prop_name:
feats[prop_name] = feats[prop_name][::-1]
return feats, pre_offset, pre_mask
def stage3(self,
feats,
flows,
aux_feats=None,
pre_offset=None,
pre_mask=None):
"""Stage3 of PP-MSVSR network.
Args:
feats (dict): Dict of features after stage2.
flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
aux_feats (dict): Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
Returns:
feats (dict): The input feats dict with new keys 'stage3_backward' and 'stage3_forward', the value of both is Array of feature after stage3 with shape (n, c, h, w).
"""
flows_backward, flows_forward = flows
n, t, _, h, w = flows_backward.shape
# propagation branches module
for prop_name in ['stage3_backward', 'stage3_forward']:
feats[prop_name] = []
frame_idx = range(0, t + 1)
flow_idx = range(-1, t)
mapping_idx = list(range(0, len(feats['spatial'])))
mapping_idx += mapping_idx[::-1]
if 'backward' in prop_name:
frame_idx = frame_idx[::-1]
flow_idx = frame_idx
flows = flows_backward
pre_stage_name = 'stage2_backward'
else:
flows = flows_forward
pre_stage_name = 'stage2_forward'
feat_prop = paddle.zeros([n, self.mid_channels, h, w])
for i, idx in enumerate(frame_idx):
feat_current = feats['spatial'][mapping_idx[idx]]
if aux_feats is not None and 'feats' in aux_feats:
feat_current = aux_feats['feats'][mapping_idx[idx]]
if i > 0:
flow_n1 = flows[:, flow_idx[i], :, :, :]
cond_n1 = flow_warp(feat_prop,
flow_n1.transpose([0, 2, 3, 1]))
cond = paddle.concat([cond_n1, feat_current], axis=1)
feat_prop = self.deform_align[prop_name](
feat_prop, cond, flow_n1, feat_current,
pre_offset[pre_stage_name][flow_idx[i]],
pre_mask[pre_stage_name][flow_idx[i]])
# concatenate and residual blocks
feat = [feat_current] + [
feats[k][idx]
for k in feats if k not in ['spatial', prop_name]
] + [feat_prop]
feat = paddle.concat(feat, axis=1)
feat_prop = feat_prop + self.backbone[prop_name](feat)
feats[prop_name].append(feat_prop)
if 'backward' in prop_name:
feats[prop_name] = feats[prop_name][::-1]
return feats
def auxiliary_stage(self, feats, lqs):
"""Compute the output image and auxiliary feature for Auxiliary Loss in stage2.
Args:
feats (dict): Dict of features after stage2.
lqs (tensor): Input LR images with shape (n, t, c, h, w)
Returns:
dict: Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
"""
aux_feats = {}
aux_feats['outs'] = []
aux_feats['feats'] = []
num_outputs = len(feats['spatial'])
mapping_idx = list(range(0, num_outputs))
mapping_idx += mapping_idx[::-1]
for i in range(0, lqs.shape[1]):
hr = [feats[k][i] for k in feats if (k != 'spatial')]
feat_current = feats['spatial'][mapping_idx[i]]
hr.insert(0, feat_current)
hr = paddle.concat(hr, axis=1)
hr_low = self.aux_reconstruction(hr)
hr_mid = self.lrelu(self.aux_upsample1(hr_low))
hr_high = self.lrelu(self.aux_upsample2(hr_mid))
hr = self.aux_conv_last(hr_high)
hr += self.img_upsample(lqs[:, i, :, :, :])
# output tensor of auxiliary_stage with shape (n, 3, 4*h, 4*w)
aux_feats['outs'].append(hr)
aux_feat = self.aux_block_down1(paddle.concat([hr, hr_high],
axis=1))
aux_feat = self.aux_block_down2(
paddle.concat([aux_feat, hr_mid], axis=1))
aux_feat = self.aux_fusion(paddle.concat([aux_feat, hr_low],
axis=1))
# out feature of auxiliary_stage with shape (n, c, h, w)
aux_feats['feats'].append(aux_feat)
return aux_feats
def upsample(self, lqs, feats, aux_feats=None):
"""Compute the output image given the features.
Args:
lqs (tensor): Input LR images with shape (n, t, c, h, w).
feats (dict): Dict of features after stage3.
aux_feats (dict): Dict with keys 'outs' and 'feats'; the values are arrays of tensors after auxiliary_stage with shapes (n, 3, 4*h, 4*w) and (n, c, h, w), respectively.
Returns:
Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
"""
outputs = []
outputs_head = []
num_outputs = len(feats['spatial'])
mapping_idx = list(range(0, num_outputs))
mapping_idx += mapping_idx[::-1]
cas_outs = []
pas = []
hrs = []
for i in range(0, lqs.shape[1]):
hr = [
feats[k].pop(0) for k in feats
if (k != 'spatial' and k != 'feat_stage1')
]
if 'feat_stage1' in feats:
local_feat = feats['feat_stage1'].pop(0)
hr.insert(0, local_feat)
hr.insert(0, feats['spatial'][mapping_idx[i]])
hr = paddle.concat(hr, axis=1)
hr = self.reconstruction(hr)
hr = self.lrelu(self.upsample1(hr))
hr = self.lrelu(self.upsample2(hr))
if self.only_last:
hr = self.conv_last(hr)
else:
hr = self.lrelu(self.conv_hr(hr))
hr = self.conv_last(hr)
hr += self.img_upsample(lqs[:, i, :, :, :])
if self.use_local_connnect:
local_head = self.lrelu(self.aux_upsample1(local_feat))
local_head = self.lrelu(self.aux_upsample2(local_head))
hr = self.hybrid_conv_last(local_head) + hr
outputs.append(hr)
if self.auxiliary_loss:
return paddle.stack(aux_feats['outs'],
axis=1), paddle.stack(outputs, axis=1)
return paddle.stack(outputs, axis=1)
def forward(self, lqs):
"""Forward function for PP-MSVSR.
Args:
lqs (Tensor): Input LR sequence with shape (n, t, c, h, w).
Returns:
Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
"""
n, t, c, h, w = lqs.shape
lqs_downsample = lqs
# check whether the input is an extended sequence
self.check_if_mirror_extended(lqs)
feats = {}
feats_ = self.feat_extract(lqs.reshape([-1, c, h, w]))
h, w = feats_.shape[2:]
feats_ = feats_.reshape([n, t, -1, h, w])
feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)]
# compute optical flow using the low-res inputs
assert lqs_downsample.shape[3] >= 64 and lqs_downsample.shape[4] >= 64, (
'The height and width of low-res inputs must be at least 64, '
f'but got {h} and {w}.')
flows_forward, flows_backward = self.compute_flow(lqs_downsample)
# feature propagation
feats['feat_stage1'] = []
feats = self.stage1(feats, flows_backward, flows_forward)
feats, pre_offset, pre_mask = self.stage2(
feats, (flows_backward, flows_forward))
if self.auxiliary_loss:
aux_feats = self.auxiliary_stage(feats, lqs)
feats = self.stage3(feats, (flows_backward, flows_forward), aux_feats,
pre_offset, pre_mask)
return self.upsample(lqs, feats, aux_feats=aux_feats)
class AlignmentModule(nn.Layer):
"""deformable alignment module.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
deformable_groups (int): Number of deformable_groups in DeformConv2D.
"""
def __init__(self,
in_channels=128,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
deformable_groups=16):
super(AlignmentModule, self).__init__()
self.conv_offset = nn.Sequential(
nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
)
self.dcn = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.init_offset()
def init_offset(self):
constant_(self.conv_offset[-1].weight, 0)
constant_(self.conv_offset[-1].bias, 0)
def forward(self, x, extra_feat, flow_1):
extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
out = self.conv_offset(extra_feat)
o1, o2, mask = paddle.chunk(out, 3, axis=1)
# offset: residual offsets bounded to [-10, 10] by tanh, added to the
# optical flow (channels flipped, tiled per sampling location) as the base
offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
offset = offset + flow_1.flip(1).tile([1, offset.shape[1] // 2, 1, 1])
# mask
mask = F.sigmoid(mask)
out = self.dcn(x, offset, mask)
return out, offset, mask
class ReAlignmentModule(nn.Layer):
"""refine deformable alignment module.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
deformable_groups (int): Number of deformable_groups in DeformConv2D.
"""
def __init__(self,
in_channels=128,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
deformable_groups=16):
super(ReAlignmentModule, self).__init__()
self.mdconv = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.conv_offset = nn.Sequential(
nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, out_channels, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.1),
nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
)
self.dcn = DeformConv2D(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
deformable_groups=deformable_groups)
self.init_offset()
def init_offset(self):
constant_(self.conv_offset[-1].weight, 0)
constant_(self.conv_offset[-1].bias, 0)
def forward(self,
x,
extra_feat,
flow_1,
feat_current,
pre_stage_flow=None,
pre_stage_mask=None):
if pre_stage_flow is not None:
pre_feat = self.mdconv(x, pre_stage_flow, pre_stage_mask)
extra_feat = paddle.concat([pre_feat, feat_current, flow_1], axis=1)
else:
extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
out = self.conv_offset(extra_feat)
o1, o2, mask = paddle.chunk(out, 3, axis=1)
# offset: residual offsets bounded to [-10, 10] by tanh; use the stage2
# offset as the base when available, otherwise fall back to the flipped,
# tiled optical flow
offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
if pre_stage_flow is not None:
offset = offset + pre_stage_flow
else:
offset = offset + flow_1.flip(1).tile(
[1, offset.shape[1] // 2, 1, 1])
# mask
if pre_stage_mask is not None:
mask = (F.sigmoid(mask) + pre_stage_mask) / 2.0
else:
mask = F.sigmoid(mask)
out = self.dcn(x, offset, mask)
return out
class ModifiedSPyNet(nn.Layer):
"""Modified SPyNet network structure.
The differences from the SPyNet in the paper are:
1. convolutions with kernel_size=7 are replaced by convolutions with kernel_size=3 in this version,
2. fewer SPyNetBasicModules are used in this version,
3. no BN is used in this version.
Paper:
Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
Args:
act_cfg (dict): Activation function.
Default: dict(name='LeakyReLU').
num_blocks (int): Number of SPyNetBlock.
Default: 6.
use_tiny_block (bool): Whether to use tiny blocks.
Default: False.
"""
def __init__(self,
act_cfg=dict(name='LeakyReLU'),
num_blocks=6,
use_tiny_block=False):
super().__init__()
self.num_blocks = num_blocks
self.basic_module = nn.LayerList([
SPyNetBlock(act_cfg=act_cfg, use_tiny_block=use_tiny_block)
for _ in range(num_blocks)
])
self.register_buffer(
'mean',
paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
self.register_buffer(
'std',
paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
def compute_flow(self, ref, supp):
"""Compute flow from ref to supp.
Note that in this function, the images are already resized to a
multiple of 32.
Args:
ref (Tensor): Reference image with shape of (n, 3, h, w).
supp (Tensor): Supporting image with shape of (n, 3, h, w).
Returns:
Tensor: Estimated optical flow: (n, 2, h, w).
"""
n, _, h, w = ref.shape
# normalize the input images
ref = [(ref - self.mean) / self.std]
supp = [(supp - self.mean) / self.std]
# generate downsampled frames
for level in range(self.num_blocks - 1):
ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
ref = ref[::-1]
supp = supp[::-1]
# flow computation
flow = paddle.to_tensor(
np.zeros([
n, 2, h // (2**(self.num_blocks - 1)), w //
(2**(self.num_blocks - 1))
], 'float32'))
for level in range(len(ref)):
if level == 0:
flow_up = flow
else:
flow_up = F.interpolate(
flow, scale_factor=2, mode='bilinear',
align_corners=True) * 2.0
# add the residue to the upsampled flow
flow = flow_up + self.basic_module[level](paddle.concat([
ref[level],
flow_warp(supp[level],
flow_up.transpose([0, 2, 3, 1]),
padding_mode='border'), flow_up
],
axis=1))
return flow
def compute_flow_list(self, ref, supp):
n, _, h, w = ref.shape
# normalize the input images
ref = [(ref - self.mean) / self.std]
supp = [(supp - self.mean) / self.std]
# generate downsampled frames
for level in range(self.num_blocks - 1):
ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
ref = ref[::-1]
supp = supp[::-1]
# flow computation
flow_list = []
flow = paddle.to_tensor(
np.zeros([
n, 2, h // (2**(self.num_blocks - 1)), w //
(2**(self.num_blocks - 1))
], 'float32'))
for level in range(len(ref)):
if level == 0:
flow_up = flow
else:
flow_up = F.interpolate(
flow, scale_factor=2, mode='bilinear',
align_corners=True) * 2.0
# add the residue to the upsampled flow
flow = flow_up + self.basic_module[level](paddle.concat([
ref[level],
flow_warp(supp[level],
flow_up.transpose([0, 2, 3, 1]),
padding_mode='border'), flow_up
],
axis=1))
flow_list.append(flow)
return flow_list
def forward(self, ref, supp):
"""Forward function of Modified SPyNet.
This function computes the optical flow from ref to supp.
Args:
ref (Tensor): Reference image with shape of (n, 3, h, w).
supp (Tensor): Supporting image with shape of (n, 3, h, w).
Returns:
Tensor: Estimated optical flow: (n, 2, h, w).
"""
# upsize to a multiple of 32
h, w = ref.shape[2:4]
w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1)
h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1)
ref = F.interpolate(ref,
size=(h_up, w_up),
mode='bilinear',
align_corners=False)
supp = F.interpolate(supp,
size=(h_up, w_up),
mode='bilinear',
align_corners=False)
ref.stop_gradient = False
supp.stop_gradient = False
# compute flow, and resize back to the original resolution
flow = F.interpolate(self.compute_flow(ref, supp),
size=(h, w),
mode='bilinear',
align_corners=False)
# adjust the flow values
flow[:, 0, :, :] *= float(w) / float(w_up)
flow[:, 1, :, :] *= float(h) / float(h_up)
return flow
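# Illustrative usage of ModifiedSPyNet (a sketch added for documentation, not
# part of the original file). Arbitrary input sizes work because forward()
# resizes the frames to a multiple of 32 and rescales the flow back:
#
#     spynet = ModifiedSPyNet(num_blocks=6)  # randomly initialized here;
#     ref = paddle.rand([1, 3, 100, 170])    # MSVSR loads pretrained weights
#     supp = paddle.rand([1, 3, 100, 170])
#     flow = spynet(ref, supp)               # shape: [1, 2, 100, 170]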
class SPyNetBlock(nn.Layer):
"""Basic Block of Modified SPyNet.
Refer to Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017.
"""
def __init__(self, act_cfg=dict(name='LeakyReLU'), use_tiny_block=False):
super().__init__()
if use_tiny_block:
self.basic_module = nn.Sequential(
ConvLayer(in_channels=8,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=8,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=8,
out_channels=8,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=8,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
act_cfg=None))
else:
self.basic_module = nn.Sequential(
ConvLayer(in_channels=8,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=64,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=32,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
act_cfg=act_cfg),
ConvLayer(in_channels=16,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
act_cfg=None))
def forward(self, tensor_input):
"""Forward function of SPyNetBlock.
Args:
tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
8 channels contain:
[reference image (3), neighbor image (3), initial flow (2)].
Returns:
Tensor: Refined flow with shape (b, 2, h, w)
"""
return self.basic_module(tensor_input)
class ConvLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
act_cfg=dict(name='ReLU')):
super(ConvLayer, self).__init__()
self.act_cfg = act_cfg
self.with_activation = act_cfg is not None
self.conv = nn.Conv2D(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups)
if self.with_activation:
if act_cfg['name'] == 'ReLU':
self.act = paddle.nn.ReLU()
elif act_cfg['name'] == 'LeakyReLU':
self.act = nn.LeakyReLU(negative_slope=0.1)
def forward(self, tensor_input):
out = self.conv(tensor_input)
if self.with_activation:
out = self.act(out)
return out
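# ---------------------------------------------------------------------------
# Illustrative smoke test (a minimal sketch added for documentation; not part
# of the original file). Running this module directly performs one forward
# pass of the default tiny MSVSR generator on random frames. Constructing
# MSVSR downloads the pretrained ModifiedSPyNet weights, so network access is
# assumed.
if __name__ == '__main__':
    # (n, t, c, h, w); height and width must be at least 64
    lqs = paddle.rand([1, 5, 3, 64, 64])
    model = MSVSR()  # defaults: mid_channels=32, use_tiny_spynet=True
    model.eval()
    with paddle.no_grad():
        # auxiliary_loss=True, so forward returns (stage2 output, final output)
        aux_out, out = model(lqs)
    print(out.shape)  # [1, 5, 3, 256, 256] for 4x upsampling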
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from .builder import MODELS
from .sr_model import BaseSRModel
from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
from .generators.msvsr import ModifiedSPyNet
from ..modules.init import reset_parameters
from ..utils.visual import tensor2img
@MODELS.register()
class MultiStageVSRModel(BaseSRModel):
"""PP-MSVSR Model.
Paper:
PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
"""
def __init__(self, generator, fix_iter, pixel_criterion=None):
"""Initialize the PP-MSVSR class.
Args:
generator (dict): config of generator.
fix_iter (int): number of iterations during which the flow network (spynet) is kept fixed.
pixel_criterion (dict): config of pixel criterion.
"""
super(MultiStageVSRModel, self).__init__(generator, pixel_criterion)
self.fix_iter = fix_iter
self.current_iter = 1
self.flag = True
init_basicvsr_weight(self.nets['generator'])
if not self.fix_iter:
print('init train all parameters!!!')
for name, param in self.nets['generator'].named_parameters():
param.trainable = True
if 'spynet' in name:
param.optimize_attr['learning_rate'] = 0.25
def setup_input(self, input):
self.lq = paddle.to_tensor(input['lq'])
self.visual_items['lq'] = self.lq[:, 0, :, :, :]
if 'gt' in input:
self.gt = paddle.to_tensor(input['gt'])
self.visual_items['gt'] = self.gt[:, 0, :, :, :]
self.image_paths = input['lq_path']
def train_iter(self, optims=None):
optims['optim'].clear_grad()
if self.fix_iter:
if self.current_iter == 1:
print('Train MSVSR with fixed spynet for', self.fix_iter,
'iters.')
for name, param in self.nets['generator'].named_parameters():
if 'spynet' in name:
param.trainable = False
elif self.current_iter >= self.fix_iter + 1 and self.flag:
print('Train all the parameters.')
for name, param in self.nets['generator'].named_parameters():
param.trainable = True
if 'spynet' in name:
param.optimize_attr['learning_rate'] = 0.25
self.flag = False
for net in self.nets.values():
net.find_unused_parameters = False
output = self.nets['generator'](self.lq)
if isinstance(output, (list, tuple)):
out_stage2, output = output
loss_pix_stage2 = self.pixel_criterion(out_stage2, self.gt)
self.losses['loss_pix_stage2'] = loss_pix_stage2
self.visual_items['output'] = output[:, 0, :, :, :]
# pixel loss
loss_pix = self.pixel_criterion(output, self.gt)
self.losses['loss_pix'] = loss_pix
self.loss = sum(_value for _key, _value in self.losses.items()
if 'loss_pix' in _key)
self.losses['loss'] = self.loss
self.loss.backward()
optims['optim'].step()
self.current_iter += 1
def test_iter(self, metrics=None):
self.gt = self.gt.cpu()
self.nets['generator'].eval()
with paddle.no_grad():
output = self.nets['generator'](self.lq)
if isinstance(output, (list, tuple)):
out_stage1, output = output
self.nets['generator'].train()
out_img = []
gt_img = []
_, t, _, _, _ = self.gt.shape
for i in range(t):
out_tensor = output[0, i]
gt_tensor = self.gt[0, i]
out_img.append(tensor2img(out_tensor, (0., 1.)))
gt_img.append(tensor2img(gt_tensor, (0., 1.)))
if metrics is not None:
for metric in metrics.values():
metric.update(out_img, gt_img, is_seq=True)
def init_basicvsr_weight(net):
for m in net.children():
if hasattr(m,
'weight') and not isinstance(m,
(nn.BatchNorm, nn.BatchNorm2D)):
reset_parameters(m)
continue
if (not isinstance(
m,
(ResidualBlockNoBN, PixelShufflePack, SPyNet, ModifiedSPyNet))):
init_basicvsr_weight(m)