diff --git a/benchmark/README.md b/benchmark/README.md index 1f8ee7b77adc78c373d014bbf9959f00f04f8b1d..e24b67d5bd54510f39f3fcc0ad3a08c071c52013 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -39,7 +39,7 @@ run_cmd="set -xe; nvidia-docker run --name test_paddlegan -i \ --net=host \ - --shm-size=1g \ + --shm-size=128g \ -v $PWD:/workspace \ ${ImageName} /bin/bash -c "${run_cmd}" ``` diff --git a/configs/basicvsr++_vimeo90k_BD.yaml b/configs/basicvsr++_vimeo90k_BD.yaml index 3b1c8691ef0e844cd730dd70a8d05f7ef57e1f14..ec4294d73744f070644d184a55ce7de5d0c1b70d 100644 --- a/configs/basicvsr++_vimeo90k_BD.yaml +++ b/configs/basicvsr++_vimeo90k_BD.yaml @@ -59,7 +59,7 @@ dataset: test: name: VSRFolderDataset - # for udm10 dataset + # for UDM10 dataset # lq_folder: data/udm10/BDx4 # gt_folder: data/udm10/GT lq_folder: data/Vid4/BDx4 @@ -67,7 +67,7 @@ dataset: preprocess: - name: GetNeighboringFramesIdx interval_list: [1] - # for udm10 dataset + # for UDM10 dataset # filename_tmpl: '{:04d}.png' filename_tmpl: '{:08d}.png' - name: ReadImageSequence diff --git a/configs/basicvsr_reds.yaml b/configs/basicvsr_reds.yaml index 9e034f009708edc4b5f42eb22a148b41899cc650..0b6b835e0a407a5c7a1693f88cc7ba5ea0106cac 100644 --- a/configs/basicvsr_reds.yaml +++ b/configs/basicvsr_reds.yaml @@ -23,8 +23,8 @@ dataset: train: name: RepeatDataset times: 1000 - num_workers: 4 # 6 - batch_size: 2 # 4*2 + num_workers: 4 + batch_size: 2 #4 GPUs dataset: name: SRREDSMultipleGTDataset mode: train diff --git a/configs/edvr_l_w_tsa.yaml b/configs/edvr_l_w_tsa.yaml index 9f95deb7e68c74a01568ddd7b929c4576af51e80..6c179296d87d0ff085233b223d8f73386e963df4 100644 --- a/configs/edvr_l_w_tsa.yaml +++ b/configs/edvr_l_w_tsa.yaml @@ -43,7 +43,7 @@ dataset: scale: 4 fix_random_seed: 10 num_workers: 3 - batch_size: 4 # 8GUPs + batch_size: 4 # 8GPUs test: diff --git a/configs/edvr_l_wo_tsa.yaml b/configs/edvr_l_wo_tsa.yaml index facbed516aae3c1c6ac4aeda75180e7244b1e586..d20901016132a7eaa602f647b99ae50dd60e86cf 100644 --- a/configs/edvr_l_wo_tsa.yaml +++ b/configs/edvr_l_wo_tsa.yaml @@ -42,7 +42,7 @@ dataset: scale: 4 fix_random_seed: 10 num_workers: 3 - batch_size: 4 # 8GUPs + batch_size: 4 # 8GPUs test: diff --git a/configs/edvr_m_w_tsa.yaml b/configs/edvr_m_w_tsa.yaml index e40fa0c9dfecb6f8689b81d6c4f39d02956d4175..aef89cbfbe840f46d958e831f632bc10d18aab3d 100644 --- a/configs/edvr_m_w_tsa.yaml +++ b/configs/edvr_m_w_tsa.yaml @@ -46,7 +46,7 @@ dataset: scale: 4 fix_random_seed: 10 num_workers: 3 - batch_size: 4 # 8GUPs + batch_size: 4 # 8GPUs test: diff --git a/configs/edvr_m_wo_tsa.yaml b/configs/edvr_m_wo_tsa.yaml index 3817cf1390be47137119b22ef6af6b582f5c2456..d6502edcf0eb3cfb471e7ecaf6d7a585c3cd1c66 100644 --- a/configs/edvr_m_wo_tsa.yaml +++ b/configs/edvr_m_wo_tsa.yaml @@ -42,7 +42,7 @@ dataset: scale: 4 fix_random_seed: 10 num_workers: 3 - batch_size: 4 # 8GUPs + batch_size: 4 # 8GPUs test: diff --git a/configs/iconvsr_reds.yaml b/configs/iconvsr_reds.yaml index 314d29a793fe22ac723a5543d03f2e97d054e557..32ea630b25bd1c3713ac4c55ce3e46aa651f50a1 100644 --- a/configs/iconvsr_reds.yaml +++ b/configs/iconvsr_reds.yaml @@ -23,8 +23,8 @@ dataset: train: name: RepeatDataset times: 1000 - num_workers: 4 # 6 - batch_size: 2 # 4*2 + num_workers: 4 + batch_size: 2 #4 GPUs dataset: name: SRREDSMultipleGTDataset mode: train diff --git a/configs/lapstyle_draft.yaml b/configs/lapstyle_draft.yaml index af355bccfdae81345cfdd728e15d326a68c6c754..46c4a3f23b875e0fee9711a8743188bc51290886 100644 --- a/configs/lapstyle_draft.yaml +++ 
b/configs/lapstyle_draft.yaml @@ -32,7 +32,7 @@ dataset: load_size: 136 crop_size: 128 num_workers: 16 - batch_size: 5 + batch_size: 5 #1 GPUs test: name: LapStyleDataset content_root: data/coco/test2017/ diff --git a/configs/lapstyle_rev_first.yaml b/configs/lapstyle_rev_first.yaml index 50654f5951bd27c4a4350d56ebe0ed317ab371fb..e0158dea002d6ea27450a4a5ae2bcc0f20847293 100644 --- a/configs/lapstyle_rev_first.yaml +++ b/configs/lapstyle_rev_first.yaml @@ -38,7 +38,7 @@ dataset: load_size: 280 crop_size: 256 num_workers: 16 - batch_size: 5 + batch_size: 5 #1 GPUs test: name: LapStyleDataset content_root: data/coco/test2017/ diff --git a/configs/lapstyle_rev_second.yaml b/configs/lapstyle_rev_second.yaml index 3f212feb9b52dc93868dde475fa41eebaf504f2b..167ecb7708de60a9c4a34487041ec51158073716 100644 --- a/configs/lapstyle_rev_second.yaml +++ b/configs/lapstyle_rev_second.yaml @@ -38,7 +38,7 @@ dataset: load_size: 540 crop_size: 512 num_workers: 16 - batch_size: 2 + batch_size: 2 #1 GPUs test: name: LapStyleDataset content_root: data/coco/test2017/ diff --git a/configs/msvsr_l_reds.yaml b/configs/msvsr_l_reds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f5c02227e268b523e457bfb2d69f87a14de60f4 --- /dev/null +++ b/configs/msvsr_l_reds.yaml @@ -0,0 +1,102 @@ +total_iters: 300000 +output_dir: output_dir +find_unused_parameters: True +checkpoints_dir: checkpoints +use_dataset: True +# tensor range for function tensor2img +min_max: + (0., 1.) + +model: + name: MultiStageVSRModel + fix_iter: 2500 + generator: + name: MSVSR + mid_channels: 64 + num_init_blocks: 5 + num_blocks: 7 + num_reconstruction_blocks: 5 + only_last: False + use_tiny_spynet: False + deform_groups: 8 + stage1_groups: 8 + auxiliary_loss: True + use_refine_align: True + aux_reconstruction_blocks: 2 + use_local_connnect: True + pixel_criterion: + name: CharbonnierLoss + reduction: mean + +dataset: + train: + name: RepeatDataset + times: 1000 + num_workers: 4 + batch_size: 2 #8 gpus + use_shared_memory: True + dataset: + name: SRREDSMultipleGTDataset + mode: train + lq_folder: data/REDS/train_sharp_bicubic/X4 + gt_folder: data/REDS/train_sharp/X4 + crop_size: 256 + interval_list: [1] + random_reverse: False + number_frames: 30 + use_flip: True + use_rot: True + scale: 4 + val_partition: REDS4 + + test: + name: SRREDSMultipleGTDataset + mode: test + lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4 + gt_folder: data/REDS/REDS4_test_sharp/X4 + interval_list: [1] + random_reverse: False + number_frames: 100 + use_flip: False + use_rot: False + scale: 4 + val_partition: REDS4 + num_workers: 0 + batch_size: 1 + +lr_scheduler: + name: CosineAnnealingRestartLR + learning_rate: !!float 2e-4 + periods: [300000] + restart_weights: [1] + eta_min: !!float 1e-7 + +optimizer: + name: Adam + # add parameters of net_name to optim + # name should in self.nets + net_names: + - generator + beta1: 0.9 + beta2: 0.99 + +validate: + interval: 5000 + save_img: false + + metrics: + psnr: # metric name, can be arbitrary + name: PSNR + crop_border: 0 + test_y_channel: false + ssim: + name: SSIM + crop_border: 0 + test_y_channel: false + +log_config: + interval: 10 + visiual_interval: 5000 + +snapshot_config: + interval: 5000 diff --git a/configs/msvsr_reds.yaml b/configs/msvsr_reds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ddc333c41e853b27649418cf07f6695a7d0d334 --- /dev/null +++ b/configs/msvsr_reds.yaml @@ -0,0 +1,102 @@ +total_iters: 150000 +output_dir: output_dir +find_unused_parameters: True 
+checkpoints_dir: checkpoints +use_dataset: True +# tensor range for function tensor2img +min_max: + (0., 1.) + +model: + name: MultiStageVSRModel + fix_iter: 2500 + generator: + name: MSVSR + mid_channels: 32 + num_init_blocks: 2 + num_blocks: 3 + num_reconstruction_blocks: 2 + only_last: True + use_tiny_spynet: True + deform_groups: 4 + stage1_groups: 8 + auxiliary_loss: True + use_refine_align: True + aux_reconstruction_blocks: 1 + use_local_connnect: True + pixel_criterion: + name: CharbonnierLoss + reduction: mean + +dataset: + train: + name: RepeatDataset + times: 1000 + num_workers: 6 + batch_size: 2 #8 gpus + use_shared_memory: True + dataset: + name: SRREDSMultipleGTDataset + mode: train + lq_folder: data/REDS/train_sharp_bicubic/X4 + gt_folder: data/REDS/train_sharp/X4 + crop_size: 256 + interval_list: [1] + random_reverse: False + number_frames: 20 + use_flip: True + use_rot: True + scale: 4 + val_partition: REDS4 + + test: + name: SRREDSMultipleGTDataset + mode: test + lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4 + gt_folder: data/REDS/REDS4_test_sharp/X4 + interval_list: [1] + random_reverse: False + number_frames: 100 + use_flip: False + use_rot: False + scale: 4 + val_partition: REDS4 + num_workers: 0 + batch_size: 1 + +lr_scheduler: + name: CosineAnnealingRestartLR + learning_rate: !!float 2e-4 + periods: [150000] + restart_weights: [1] + eta_min: !!float 1e-7 + +optimizer: + name: Adam + # add parameters of net_name to optim + # name should in self.nets + net_names: + - generator + beta1: 0.9 + beta2: 0.99 + +validate: + interval: 5000 + save_img: false + + metrics: + psnr: # metric name, can be arbitrary + name: PSNR + crop_border: 0 + test_y_channel: false + ssim: + name: SSIM + crop_border: 0 + test_y_channel: false + +log_config: + interval: 10 + visiual_interval: 5000 + +snapshot_config: + interval: 5000 diff --git a/configs/msvsr_vimeo90k_BD.yaml b/configs/msvsr_vimeo90k_BD.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7496c3a671138dbe2b446fa5bb30fb990ca2beda --- /dev/null +++ b/configs/msvsr_vimeo90k_BD.yaml @@ -0,0 +1,131 @@ +total_iters: 300000 +output_dir: output_dir +find_unused_parameters: True +checkpoints_dir: checkpoints +use_dataset: True +# tensor range for function tensor2img +min_max: + (0., 1.) 
+ +model: + name: MultiStageVSRModel + fix_iter: -1 + generator: + name: MSVSR + mid_channels: 32 + num_init_blocks: 2 + num_blocks: 3 + num_reconstruction_blocks: 2 + only_last: True + use_tiny_spynet: True + deform_groups: 4 + stage1_groups: 8 + auxiliary_loss: True + use_refine_align: True + aux_reconstruction_blocks: 1 + use_local_connnect: True + pixel_criterion: + name: CharbonnierLoss + reduction: mean + +dataset: + train: + name: RepeatDataset + times: 1000 + num_workers: 4 + batch_size: 2 #8 gpus + dataset: + name: VSRVimeo90KDataset + # mode: train + lq_folder: data/vimeo90k/vimeo_septuplet_BD_matlabLRx4/sequences + gt_folder: data/vimeo90k/vimeo_septuplet/sequences + ann_file: data/vimeo90k/vimeo_septuplet/sep_trainlist.txt + preprocess: + - name: ReadImageSequence + key: lq + - name: ReadImageSequence + key: gt + - name: Transforms + input_keys: [lq, gt] + pipeline: + - name: SRPairedRandomCrop + gt_patch_size: 256 + scale: 4 + keys: [image, image] + - name: PairedRandomHorizontalFlip + keys: [image, image] + - name: PairedRandomVerticalFlip + keys: [image, image] + - name: PairedRandomTransposeHW + keys: [image, image] + - name: TransposeSequence + keys: [image, image] + - name: MirrorVideoSequence + - name: NormalizeSequence + mean: [0., .0, 0.] + std: [255., 255., 255.] + keys: [image, image] + + test: + name: VSRFolderDataset + # for udm10 dataset + # lq_folder: data/udm10/BDx4 + # gt_folder: data/udm10/GT + lq_folder: data/Vid4/BDx4 + gt_folder: data/Vid4/GT + preprocess: + - name: GetNeighboringFramesIdx + interval_list: [1] + # for udm10 dataset + # filename_tmpl: '{:04d}.png' + filename_tmpl: '{:08d}.png' + - name: ReadImageSequence + key: lq + - name: ReadImageSequence + key: gt + - name: Transforms + input_keys: [lq, gt] + pipeline: + - name: TransposeSequence + keys: [image, image] + - name: NormalizeSequence + mean: [0., .0, 0.] + std: [255., 255., 255.] + keys: [image, image] + +lr_scheduler: + name: CosineAnnealingRestartLR + learning_rate: !!float 2e-4 + periods: [300000] + restart_weights: [1] + eta_min: !!float 1e-7 + +optimizer: + name: Adam + # add parameters of net_name to optim + # name should in self.nets + net_names: + - generator + beta1: 0.9 + beta2: 0.99 + +validate: + interval: 2500 + save_img: false + + metrics: + psnr: # metric name, can be arbitrary + name: PSNR + crop_border: 0 + test_y_channel: true + ssim: + name: SSIM + crop_border: 0 + test_y_channel: true + +log_config: + interval: 10 + visiual_interval: 5000 + +snapshot_config: + interval: 2500 diff --git a/docs/en_US/tutorials/video_super_resolution.md b/docs/en_US/tutorials/video_super_resolution.md index 4c482fab084b6e9c728d849d3df4cd66803ea8ca..380015da4300dc354ce5c053a04707d01595bb77 100644 --- a/docs/en_US/tutorials/video_super_resolution.md +++ b/docs/en_US/tutorials/video_super_resolution.md @@ -3,15 +3,22 @@ ## 1.1 Principle - Video super-resolution originates from image super-resolution, which aims to recover high-resolution (HR) images from one or more low resolution (LR) images. The difference between them is that the video is composed of multiple frames, so the video super-resolution usually uses the information between frames to repair. Here we provide the video super-resolution model [EDVR](https://arxiv.org/pdf/1905.02716.pdf).[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf). 
+ Video super-resolution originates from image super-resolution, which aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is that a video is composed of multiple frames, so video super-resolution usually uses the information between frames for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.

  [EDVR](https://arxiv.org/pdf/1905.02716.pdf) wins the champions and outperforms the second place by a large margin in all four tracks in the NTIRE19 video restoration and enhancement challenges. The main difficulties of video super-resolution from two aspects: (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR devise a Pyramid, Cascading and Deformable (PCD) alignment module, in which frame alignment is done at the feature level using deformable convolutions in a coarse-to-fine manner. Second, EDVR propose a Temporal and Spatial Attention (TSA) fusion module, in which attention is applied both temporally and spatially, so as to emphasize important features for subsequent restoration.

+ [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components for VSR guided by four basic functionalities, i.e., propagation, alignment, aggregation, and upsampling. By reusing existing components with minimal redesigns, a succinct pipeline, BasicVSR, achieves appealing improvements in terms of speed and restoration quality in comparison to many state-of-the-art algorithms. By presenting an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be extended to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
+
+ [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By empowering the recurrent framework with the enhanced propagation and alignment, BasicVSR++ can exploit spatiotemporal information across misaligned video frames more effectively. The new components lead to improved performance under a similar computational constraint. In particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. In NTIRE 2021, BasicVSR++ obtained three champions and one runner-up in the Video Super-Resolution and Compressed Video Enhancement Challenges.
+
+ PP-MSVSR is a multi-stage VSR deep architecture with a local fusion module, an auxiliary loss and a refined alignment module to refine the enhanced result progressively. Specifically, to strengthen the fusion of features across frames during feature propagation, a local fusion module is designed in stage-1 to perform local feature fusion before propagation. Moreover, an auxiliary loss is introduced in stage-2 to make the features obtained by the propagation module retain more information correlated with the HR space, and a refined alignment module is introduced in stage-3 to make full use of the feature information of the previous stage.
Extensive experiments substantiate that PP-MSVSR achieves promising performance on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
+
 ## 1.2 How to use
 ### 1.2.1 Prepare Datasets
+ Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4 and UDM10. The REDS and Vimeo90K datasets include both training and test sets, while Vid4 and UDM10 are test-only datasets. Download and decompress the required dataset and place it under ``PaddleGAN/data``.

 REDS([download](https://seungjunnah.github.io/Datasets/reds.html))is a newly proposed high-quality (720p) video dataset in the NTIRE19 Competition. REDS consists of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test ground truth is not available, we select four representative clips (they are '000', '011', '015', '020', with diverse scenes and motions) as our test set, denoted by REDS4. The remaining training and validation clips are re-grouped as our training dataset (a total of 266 clips).

@@ -31,6 +38,49 @@
   ...
   ```

+ Vimeo90K ([download](http://toflow.csail.mit.edu/)) is designed by Tianfan Xue et al. for four video processing tasks: temporal frame interpolation, video denoising, video deblocking, and video super-resolution. Vimeo90K is a large-scale, high-quality video dataset. It consists of 89,800 video clips downloaded from vimeo.com, covering a large variety of scenes and actions.
+
+ The structure of the processed Vimeo90K is as follows:
+ ```
+   PaddleGAN
+     ├── data
+       ├── Vimeo90K
+         ├── vimeo_septuplet
+         |   |──sequences
+         |   └──sep_trainlist.txt
+         ├── vimeo_septuplet_BD_matlabLRx4
+         |   └──sequences
+         └── vimeo_super_resolution_test
+             |──low_resolution
+             |──target
+             └──sep_testlist.txt
+           ...
+ ```
+
+ Vid4 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for VSR, which contains 4 video segments.
+ The structure of the processed Vid4 is as follows:
+ ```
+   PaddleGAN
+     ├── data
+       ├── Vid4
+         ├── BDx4
+         └── GT
+           ...
+ ```
+
+ UDM10 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for VSR, which contains 10 video segments.
+ The structure of the processed UDM10 is as follows:
+ ```
+   PaddleGAN
+     ├── data
+       ├── udm10
+         ├── BDx4
+         └── GT
+           ...
+ ```
+
+
 ### 1.2.2 Train/Test

 According to the number of channels, EDVR are divided into EDVR_L(128 channels) and EDVR_M (64 channels). Then, taking EDVR_M as an example, the model training and testing are introduced.

@@ -63,24 +113,37 @@
 python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
 ```

+ To train or test other VSR models, you can find the config file of the corresponding model under ``PaddleGAN/configs``, then replace the config file in the commands above with it.
+
 ## 1.3 Results
 The experimental results are evaluated on RGB channel.
 The metrics are PSNR / SSIM.
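+
+As a reference for how these numbers are computed, below is a minimal sketch that scores a single frame pair with the metric helpers in `ppgan/metrics/psnr_ssim.py`. The file paths are placeholders, and `crop_border=0` with the default `test_y_channel=False` mirrors the REDS4 configs:
+
+```python
+import cv2
+from ppgan.metrics.psnr_ssim import calculate_psnr, calculate_ssim
+
+# Placeholder paths: point these at a restored frame and its ground truth.
+output = cv2.imread('restored_frame.png')  # HWC, BGR, uint8
+gt = cv2.imread('gt_frame.png')
+
+psnr = calculate_psnr(output, gt, crop_border=0, input_order='HWC')
+ssim = calculate_ssim(output, gt, crop_border=0, input_order='HWC')
+print(f'PSNR: {psnr:.4f} / SSIM: {ssim:.4f}')
+```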
+VSR quantitative comparison on the REDS4 test dataset from REDS
+| Method | Params (M) | FLOPs (G) | REDS4 |
+|---|---|---|---|
+| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
+| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
+| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
+| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
+| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
+| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
+| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
+| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
+| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
+
+Deblur quantitative comparison on the REDS4 test dataset from REDS
 | Method | REDS4 |
 |---|---|
-| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 |
-| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 |
-| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 |
-| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 |
 | EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
 | EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
-| BasicVSR_x4 | 31.4325 / 0.8913 |
-| IconVSR_x4 | 31.6882 / 0.8950 |
-| BasicVSR++_x4 | 32.4018 / 0.9071 |
+
+VSR quantitative comparison on Vimeo90K, Vid4 and UDM10
+| Model | Vimeo90K | Vid4 | UDM10 |
+|---|---|---|---|
+| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|

 ## 1.4 Model Download
 | Method | Dataset | Download Link |
@@ -94,7 +157,9 @@ The metrics are PSNR / SSIM.
 | BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
 | IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
 | BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
-
+| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
+| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
+| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)

@@ -133,3 +198,10 @@ The metrics are PSNR / SSIM.
 year = {2021}
 }
 ```
+
+- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]()
+
+  ```
+  @article{
+  }
+  ```
diff --git a/docs/zh_CN/tutorials/video_super_resolution.md b/docs/zh_CN/tutorials/video_super_resolution.md
index 0e88959c300de0c1839b03f2611f04c45be8657c..8eb458cfc96750da070320804f7ca7874d68f897 100644
--- a/docs/zh_CN/tutorials/video_super_resolution.md
+++ b/docs/zh_CN/tutorials/video_super_resolution.md
@@ -3,16 +3,22 @@
 ## 1.1 原理介绍

-  视频超分源于图像超分,其目的是从一个或多个低分辨率(LR)图像中恢复高分辨率(HR)图像。它们的区别也很明显,由于视频是由多个帧组成的,所以视频超分通常利用帧间的信息来进行修复。这里我们提供视频超分模型[EDVR](https://arxiv.org/pdf/1905.02716.pdf),[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf).
+ 视频超分源于图像超分,其目的是从一个或多个低分辨率(LR)图像中恢复高分辨率(HR)图像。它们的区别也很明显,由于视频是由多个帧组成的,所以视频超分通常利用帧间的信息来进行修复。这里我们提供视频超分模型[EDVR](https://arxiv.org/pdf/1905.02716.pdf),[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf)和PP-MSVSR。 [EDVR](https://arxiv.org/pdf/1905.02716.pdf)模型在NTIRE19视频恢复和增强挑战赛的四个赛道中都赢得了冠军,并以巨大的优势超过了第二名。视频超分的主要难点在于(1)如何在给定大运动的情况下对齐多个帧;(2)如何有效地融合具有不同运动和模糊的不同帧。首先,为了处理大的运动,EDVR模型设计了一个金字塔级联的可变形(PCD)对齐模块,在该模块中,从粗到精的可变形卷积被使用来进行特征级的帧对齐。其次,EDVR使用了时空注意力(TSA)融合模块,该模块在时间和空间上同时应用注意力机制,以强调后续恢复的重要特征。 + [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf)在VSR的指导下重新考虑了四个基本模块(即传播、对齐、聚合和上采样)的一些最重要的组件。 通过添加一些小设计,重用一些现有组件,得到了简洁的 BasicVSR。与许多最先进的算法相比,BasicVSR在速度和恢复质量方面实现了有吸引力的改进。 同时,通过添加信息重新填充机制和耦合传播方案以促进信息聚合,BasicVSR 可以扩展为 [IconVSR](https://arxiv.org/pdf/2012.02181.pdf),IconVSR可以作为未来 VSR 方法的强大基线 . + [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf)通过提出二阶网格传播和导流可变形对齐来重新设计BasicVSR。通过增强传播和对齐来增强循环框架,BasicVSR++可以更有效地利用未对齐视频帧的时空信息。 在类似的计算约束下,新组件可提高性能。特别是,BasicVSR++ 以相似的参数数量在 PSNR 方面比 BasicVSR 高0.82dB。BasicVSR++ 在NTIRE2021的视频超分辨率和压缩视频增强挑战赛中获得三名冠军和一名亚军。 + + PP-MSVSR是一种多阶段视频超分深度架构,具有局部融合模块、辅助损失和细化对齐模块,以逐步细化增强结果。具体来说,在第一阶段设计了局部融合模块,在特征传播之前进行局部特征融合, 以加强特征传播中跨帧特征的融合。在第二阶段中引入了一个辅助损失,使传播模块获得的特征保留了更多与HR空间相关的信息。在第三阶段中引入了一个细化的对齐模块,以充分利用前一阶段传播模块的特征信息。大量实验证实,PP-MSVSR在Vid4数据集性能优异,仅使用 1.45M 参数PSNR指标即可达到28.13dB。 ## 1.2 如何使用 ### 1.2.1 数据准备 + 这里提供4个视频超分辨率常用数据集,REDS,Vimeo90K,Vid4,UDM10。其中REDS和vimeo90k数据集包括训练集和测试集,Vid4和UDM10为测试数据集。将需要的数据集下载解压后放到``PaddleGAN/data``文件夹下 。 + REDS([数据下载](https://seungjunnah.github.io/Datasets/reds.html))数据集是NTIRE19公司最新提出的高质量(720p)视频数据集,其由240个训练片段、30个验证片段和30个测试片段组成(每个片段有100个连续帧)。由于测试数据集不可用,这里在训练集选择了四个具有代表性的片段(分别为'000', '011', '015', '020',它们具有不同的场景和动作)作为测试集,用REDS4表示。剩下的训练和验证片段被重新分组为训练数据集(总共266个片段)。 处理后的数据集 REDS 的组成形式如下: @@ -31,6 +37,49 @@ ... ``` + Vimeo90K([数据下载](http://toflow.csail.mit.edu/))数据集是Tianfan Xue等人构建的一个用于视频超分、视频降噪、视频去伪影、视频插帧的数据集。Vimeo90K是大规模、高质量的视频数据集,包含从vimeo.com下载的 89,800 个视频剪辑,涵盖了大量场景和动作。 + + 处理后的数据集 Vimeo90K 的组成形式如下: + ``` + PaddleGAN + ├── data + ├── Vimeo90K + ├── vimeo_septuplet + | |──sequences + | └──sep_trainlist.txt + ├── vimeo_septuplet_BD_matlabLRx4 + | └──sequences + └── vimeo_super_resolution_test + |──low_resolution + |──target + └──sep_testlist.txt + ... + ``` + + Vid4([数据下载](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip))数据集是常用的视频超分验证数据集,包含4个视频段。 + + 处理后的数据集 Vid4 的组成形式如下: + ``` + PaddleGAN + ├── data + ├── Vid4 + ├── BDx4 + └── GT + ... + ``` + + UDM10([数据下载](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar))数据集是常用的视频超分验证数据集,包含10个视频段。 + + 处理后的数据集 UDM10 的组成形式如下: + ``` + PaddleGAN + ├── data + ├── udm10 + ├── BDx4 + └── GT + ... + ``` + ### 1.2.2 训练/测试 EDVR模型根据模型中间通道数分为EDVR_L(128通道)和EDVR_M(64通道)两种模型。下面以EDVR_M模型为例介绍模型训练与测试。 @@ -59,23 +108,37 @@ python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT} ``` + 训练或测试其他视频超分模型,可以在``PaddleGAN/configs``文件夹下找到对应模型的配置文件,将命令中的配置文件改成该视频超分模型的配置文件即可。 + ## 1.3 实验结果展示 实验数值结果是在 RGB 通道上进行评估。 度量指标为 PSNR / SSIM. 
+REDS的测试数据集REDS4上的超分性能对比 +| 模型| 参数量(M) | 计算量(G) | REDS4 | +|---|---|---|---| +| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 | +| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 | +| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 | +| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 | +| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 | +| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 | +| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 | +| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 | +| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 | + +REDS的测试数据集REDS4上的去模糊性能对比 | 模型 | REDS4 | |---|---| -| EDVR_M_wo_tsa_SRx4 | 30.4429 / 0.8684 | -| EDVR_M_w_tsa_SRx4 | 30.5169 / 0.8699 | -| EDVR_L_wo_tsa_SRx4 | 30.8649 / 0.8761 | -| EDVR_L_w_tsa_SRx4 | 30.9336 / 0.8773 | | EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 | | EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 | -| BasicVSR_x4 | 31.4325 / 0.8913 | -| IconVSR_x4 | 31.6882 / 0.8950 | -| BasicVSR++_x4 | 32.4018 / 0.9071 | + +Vimeo90K,Vid4,UDM10测试数据集上超分性能对比 +| 模型 | Vimeo90K | Vid4 | UDM10 | +|---|---|---|---| +| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699| ## 1.4 模型下载 | 模型 | 数据集 | 下载地址 | @@ -89,8 +152,9 @@ | BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams) | IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams) | BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams) - - +| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams) +| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams) +| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams) # 参考文献 @@ -125,3 +189,10 @@ year = {2021} } ``` + +- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution]() + + ``` + @article{ + } + ``` diff --git a/ppgan/metrics/psnr_ssim.py b/ppgan/metrics/psnr_ssim.py index e6885fb044584b9ae4dc39b2f07ec5a674bdb397..7ed288a295169670f79e3911c3b6aa15d34bcf81 100644 --- a/ppgan/metrics/psnr_ssim.py +++ b/ppgan/metrics/psnr_ssim.py @@ -212,8 +212,8 @@ def calculate_ssim(img1, f'Wrong input_order {input_order}. 
Supported input_orders are ' '"HWC" and "CHW"') - img1 = img1.copy().astype('float32')[..., ::-1] - img2 = img2.copy().astype('float32')[..., ::-1] + img1 = img1.copy().astype('float32') + img2 = img2.copy().astype('float32') img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) diff --git a/ppgan/models/__init__.py b/ppgan/models/__init__.py index 5f4c175512befd07aa17a10fad60bf1d176e5ee1..c1df8bc6d08078dc86a43f72cfb17b4626f23cba 100644 --- a/ppgan/models/__init__.py +++ b/ppgan/models/__init__.py @@ -33,3 +33,4 @@ from .lapstyle_model import LapStyleDraModel, LapStyleRevFirstModel, LapStyleRev from .basicvsr_model import BasicVSRModel from .mpr_model import MPRModel from .photopen_model import PhotoPenModel +from .msvsr_model import MultiStageVSRModel diff --git a/ppgan/models/generators/__init__.py b/ppgan/models/generators/__init__.py index 333526273bfd538b1859c2c232ac9b9d3816787d..24926b0ef59a56c0a002a3400f165765b9369b21 100755 --- a/ppgan/models/generators/__init__.py +++ b/ppgan/models/generators/__init__.py @@ -36,4 +36,5 @@ from .iconvsr import IconVSR from .gpen import GPEN from .pan import PAN from .generater_photopen import SPADEGenerator -from .basicvsr_plus_plus import BasicVSRPlusPlus \ No newline at end of file +from .basicvsr_plus_plus import BasicVSRPlusPlus +from .msvsr import MSVSR diff --git a/ppgan/models/generators/msvsr.py b/ppgan/models/generators/msvsr.py new file mode 100644 index 0000000000000000000000000000000000000000..40ce3c8c4897ce6976019eabe0e9a33db755da42 --- /dev/null +++ b/ppgan/models/generators/msvsr.py @@ -0,0 +1,1106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.vision.ops import DeformConv2D + +from .basicvsr import PixelShufflePack, flow_warp, SPyNet, ResidualBlocksWithInputConv +from ...utils.download import get_path_from_url +from ...modules.init import kaiming_normal_, constant_ +from .builder import GENERATORS + + +@GENERATORS.register() +class MSVSR(nn.Layer): + """PP-MSVSR network structure for video super-resolution. + + Support only x4 upsampling. + Paper: + PP-MSVSR: Multi-Stage Video Super-Resolution, 2021 + + Args: + mid_channels (int): Channel number of the intermediate features. + Default: 32. + num_init_blocks (int): Number of residual blocks in feat_extract. + Default: 2. + num_blocks (int): Number of residual blocks in each propagation branch. + Default: 3. + num_reconstruction_blocks (int): Number of residual blocks in reconstruction. + Default: 2. + only_last (bool): Whether the hr feature only do the last convolution. + Default: True. + use_tiny_spynet (bool): Whether use tiny spynet. + Default: True. + deform_groups (int): Number of deformable_groups in DeformConv2D in stage2 and stage3. + Defaults: 4. + stage1_groups (int): Number of deformable_groups in DeformConv2D in stage1. + Defaults: 8. 
+ auxiliary_loss (bool): Whether use auxiliary loss. + Default: True. + use_refine_align (bool): Whether use refine align. + Default: True. + aux_reconstruction_blocks : Number of residual blocks in auxiliary reconstruction. + Default: 1. + use_local_connnect (bool): Whether add feature of stage1 after upsample. + Default: True. + """ + def __init__(self, + mid_channels=32, + num_init_blocks=2, + num_blocks=3, + num_reconstruction_blocks=2, + only_last=True, + use_tiny_spynet=True, + deform_groups=4, + stage1_groups=8, + auxiliary_loss=True, + use_refine_align=True, + aux_reconstruction_blocks=1, + use_local_connnect=True): + + super().__init__() + + self.mid_channels = mid_channels + self.only_last = only_last + self.deform_groups = deform_groups + self.auxiliary_loss = auxiliary_loss + self.use_refine_align = use_refine_align + self.use_local_connnect = use_local_connnect + + # optical flow module + if use_tiny_spynet: + self.spynet = ModifiedSPyNet(num_blocks=3, use_tiny_block=True) + weight_path = get_path_from_url( + 'https://paddlegan.bj.bcebos.com/models/modified_spynet_tiny.pdparams' + ) + self.spynet.set_state_dict(paddle.load(weight_path)) + else: + self.spynet = ModifiedSPyNet(num_blocks=6, use_tiny_block=False) + weight_path = get_path_from_url( + 'https://paddlegan.bj.bcebos.com/models/modified_spynet.pdparams' + ) + self.spynet.set_state_dict(paddle.load(weight_path)) + + # feature extraction module + self.feat_extract = ResidualBlocksWithInputConv(3, mid_channels, + num_init_blocks) + + # propagation branches module for stage2 and stage3 + self.deform_align = nn.LayerDict() + self.backbone = nn.LayerDict() + + prop_names = [ + 'stage2_backward', 'stage2_forward', 'stage3_backward', + 'stage3_forward' + ] + + for i, layer in enumerate(prop_names): + if i > 1 and self.use_refine_align: + self.deform_align[layer] = ReAlignmentModule( + mid_channels, + mid_channels, + 3, + padding=1, + deformable_groups=deform_groups) + else: + self.deform_align[layer] = AlignmentModule( + mid_channels, + mid_channels, + 3, + padding=1, + deformable_groups=deform_groups) + + self.backbone[layer] = ResidualBlocksWithInputConv( + (3 + i) * mid_channels, mid_channels, num_blocks) + + # stage1 + self.stage1_align = AlignmentModule(mid_channels, + mid_channels, + 3, + padding=1, + deformable_groups=stage1_groups) + self.stage1_blocks = ResidualBlocksWithInputConv( + 3 * mid_channels, mid_channels, 3) + + # upsampling module + self.reconstruction = ResidualBlocksWithInputConv( + 6 * mid_channels, mid_channels, num_reconstruction_blocks) + + self.upsample1 = PixelShufflePack(mid_channels, + mid_channels, + 2, + upsample_kernel=3) + self.upsample2 = PixelShufflePack(mid_channels, + mid_channels, + 2, + upsample_kernel=3) + if self.only_last: + self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1) + else: + self.conv_hr = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1) + self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1) + self.img_upsample = nn.Upsample(scale_factor=4, + mode='bilinear', + align_corners=False) + + # activation function + self.lrelu = nn.LeakyReLU(negative_slope=0.1) + + # auxiliary loss + if self.auxiliary_loss: + self.aux_fusion = nn.Conv2D(mid_channels * 2, mid_channels, 3, 1, 1) + + self.aux_reconstruction = ResidualBlocksWithInputConv( + 4 * mid_channels, mid_channels, aux_reconstruction_blocks) + + self.aux_block_down1 = nn.Sequential( + nn.Conv2D(3 + mid_channels, mid_channels, 3, 2, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)) + 
self.aux_block_down2 = nn.Sequential( + nn.Conv2D(mid_channels * 2, mid_channels, 3, 2, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)) + + self.aux_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1) + + self.aux_upsample1 = PixelShufflePack(mid_channels, + mid_channels, + 2, + upsample_kernel=3) + self.aux_upsample2 = PixelShufflePack(mid_channels, + mid_channels, + 2, + upsample_kernel=3) + self.hybrid_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1) + + def check_if_mirror_extended(self, lrs): + """Check whether the input is a mirror-extended sequence. + If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the + (t-1-i)-th frame. + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + + Returns: + Bool: Whether the input is a mirror-extended sequence. + """ + + with paddle.no_grad(): + self.is_mirror_extended = False + if lrs.shape[1] % 2 == 0: + lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1) + lrs_2 = paddle.flip(lrs_2, [1]) + if paddle.norm(lrs_1 - lrs_2) == 0: + self.is_mirror_extended = True + + def compute_flow(self, lrs): + """Compute optical flow using pretrained flow network for feature alignment. + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + + Returns: + Tuple: Tensor of forward optical flow and backward optical flow with shape (n, t-1, 2, h, w). + """ + n, t, c, h, w = lrs.shape + + lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w]) + lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w]) + + flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w]) + + if self.is_mirror_extended: + flows_forward = flows_backward.flip(1) + else: + flows_forward = self.spynet(lrs_2, + lrs_1).reshape([n, t - 1, 2, h, w]) + + return flows_forward, flows_backward + + def stage1(self, feats, flows, flows_forward=None): + """Stage1 of PP-MSVSR network. + Args: + feats (dict): Dict with key 'spatial', the value is Array of tensor after feature extraction with shape (n, c, h, w). + flows (tensor): Backward optical flow with shape (n, t-1, 2, h, w). + flows_forward (tensor): Forward optical flow with shape (n, t-1, 2, h, w). + + Returns: + Dict: The input dict with new keys 'feat_stage1', the value of 'feat_stage1' is Array of tensor after Local Fusion Module with shape (n, c, h, w). 
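+
+        Note:
+            This implements the Local Fusion Module: for every frame, the
+            left and right neighboring frames are flow-warped and deformably
+            aligned to it, then the three features are concatenated and
+            fused by residual blocks.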
+ """ + + n, t, _, h, w = flows.shape + + frame_idx = range(t, -1, -1) + flow_idx = range(t, -1, -1) + mapping_idx = list(range(0, len(feats['spatial']))) + mapping_idx += mapping_idx[::-1] + + # Local Fusion Module + for i, idx in enumerate(frame_idx): + feat_current = feats['spatial'][mapping_idx[idx]] + + # get aligned right adjacent frames + if i > 0: + feat_prop = feats['spatial'][mapping_idx[idx + 1]] + flow_n1 = flows[:, flow_idx[i], :, :, :] + cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1])) + cond = paddle.concat([cond_n1, feat_current], axis=1) + feat_prop, _, _ = self.stage1_align(feat_prop, cond, flow_n1) + else: + feat_prop = paddle.zeros([n, self.mid_channels, h, w]) + + # get aligned left adjacent frames + if i < t: + feat_back = feats['spatial'][mapping_idx[idx - 1]] + flow_n1_ = flows_forward[:, flow_idx[i] - 1, :, :, :] + cond_n1_ = flow_warp(feat_back, flow_n1_.transpose([0, 2, 3, + 1])) + cond_ = paddle.concat([cond_n1_, feat_current], axis=1) + feat_back, _, _ = self.stage1_align(feat_back, cond_, flow_n1_) + else: + feat_back = paddle.zeros([n, self.mid_channels, h, w]) + + # concatenate and residual blocks + feat = [feat_current] + [feat_prop] + [feat_back] + feat = paddle.concat(feat, axis=1) + feat = self.stage1_blocks(feat) + + feats['feat_stage1'].append(feat) + + feats['feat_stage1'] = feats['feat_stage1'][::-1] + + return feats + + def stage2(self, feats, flows): + """Stage2 of PP-MSVSR network. + Args: + feats (dict): Dict with key 'spatial' and 'feat_stage1' after stage1. + flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w). + + Returns: + feats (dict): The input dict with new keys 'stage2_backward' and 'stage2_forward', the value of both is Array of feature after stage2 with shape (n, c, h, w). + pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w). + pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w). 
+ """ + flows_backward, flows_forward = flows + n, t, _, h, w = flows_backward.shape + + pre_offset = {} + pre_mask = {} + + # propagation branches module + for prop_name in ['stage2_backward', 'stage2_forward']: + pre_offset[prop_name] = [0 for _ in range(t)] + pre_mask[prop_name] = [0 for _ in range(t)] + feats[prop_name] = [] + frame_idx = range(0, t + 1) + flow_idx = range(-1, t) + mapping_idx = list(range(0, len(feats['spatial']))) + mapping_idx += mapping_idx[::-1] + + if 'backward' in prop_name: + frame_idx = frame_idx[::-1] + flow_idx = frame_idx + flows = flows_backward + else: + flows = flows_forward + + feat_prop = paddle.zeros([n, self.mid_channels, h, w]) + for i, idx in enumerate(frame_idx): + feat_current = feats['spatial'][mapping_idx[idx]] + + if i > 0: + flow_n1 = flows[:, flow_idx[i], :, :, :] + + cond_n1 = flow_warp(feat_prop, + flow_n1.transpose([0, 2, 3, 1])) + cond = paddle.concat([cond_n1, feat_current], axis=1) + + feat_prop, offset, mask = self.deform_align[prop_name]( + feat_prop, cond, flow_n1) + pre_offset[prop_name][flow_idx[i]] = offset + pre_mask[prop_name][flow_idx[i]] = (mask) + + # concatenate and residual blocks + feat = [feat_current] + [ + feats[k][idx] + for k in feats if k not in ['spatial', prop_name] + ] + [feat_prop] + + feat = paddle.concat(feat, axis=1) + feat_prop = feat_prop + self.backbone[prop_name](feat) + + feats[prop_name].append(feat_prop) + + if 'backward' in prop_name: + feats[prop_name] = feats[prop_name][::-1] + + return feats, pre_offset, pre_mask + + def stage3(self, + feats, + flows, + aux_feats=None, + pre_offset=None, + pre_mask=None): + """Stage3 of PP-MSVSR network. + Args: + feats (dict): Dict of features after stage2. + flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w). + aux_feats (dict): Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately. + pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w). + pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w). + + Returns: + feats (dict): The input feats dict with new keys 'stage3_backward' and 'stage3_forward', the value of both is Array of feature after stage3 with shape (n, c, h, w). 
+ """ + flows_backward, flows_forward = flows + n, t, _, h, w = flows_backward.shape + + # propagation branches module + for prop_name in ['stage3_backward', 'stage3_forward']: + feats[prop_name] = [] + frame_idx = range(0, t + 1) + flow_idx = range(-1, t) + mapping_idx = list(range(0, len(feats['spatial']))) + mapping_idx += mapping_idx[::-1] + + if 'backward' in prop_name: + frame_idx = frame_idx[::-1] + flow_idx = frame_idx + flows = flows_backward + pre_stage_name = 'stage2_backward' + else: + flows = flows_forward + pre_stage_name = 'stage2_forward' + + feat_prop = paddle.zeros([n, self.mid_channels, h, w]) + for i, idx in enumerate(frame_idx): + feat_current = feats['spatial'][mapping_idx[idx]] + if aux_feats is not None and 'feats' in aux_feats: + feat_current = aux_feats['feats'][mapping_idx[idx]] + + if i > 0: + flow_n1 = flows[:, flow_idx[i], :, :, :] + + cond_n1 = flow_warp(feat_prop, + flow_n1.transpose([0, 2, 3, 1])) + cond = paddle.concat([cond_n1, feat_current], axis=1) + + feat_prop = self.deform_align[prop_name]( + feat_prop, cond, flow_n1, feat_current, + pre_offset[pre_stage_name][flow_idx[i]], + pre_mask[pre_stage_name][flow_idx[i]]) + + # concatenate and residual blocks + feat = [feat_current] + [ + feats[k][idx] + for k in feats if k not in ['spatial', prop_name] + ] + [feat_prop] + + feat = paddle.concat(feat, axis=1) + feat_prop = feat_prop + self.backbone[prop_name](feat) + + feats[prop_name].append(feat_prop) + + if 'backward' in prop_name: + feats[prop_name] = feats[prop_name][::-1] + + return feats + + def auxiliary_stage(self, feats, lqs): + """Compute the output image and auxiliary feature for Auxiliary Loss in stage2. + Args: + feats (dict): Dict of features after stage2. + lqs (tensor): Input LR images with shape (n, t, c, h, w) + + Returns: + dict: Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately. + """ + aux_feats = {} + aux_feats['outs'] = [] + aux_feats['feats'] = [] + num_outputs = len(feats['spatial']) + + mapping_idx = list(range(0, num_outputs)) + mapping_idx += mapping_idx[::-1] + + for i in range(0, lqs.shape[1]): + hr = [feats[k][i] for k in feats if (k != 'spatial')] + feat_current = feats['spatial'][mapping_idx[i]] + hr.insert(0, feat_current) + hr = paddle.concat(hr, axis=1) + + hr_low = self.aux_reconstruction(hr) + hr_mid = self.lrelu(self.aux_upsample1(hr_low)) + hr_high = self.lrelu(self.aux_upsample2(hr_mid)) + + hr = self.aux_conv_last(hr_high) + hr += self.img_upsample(lqs[:, i, :, :, :]) + + # output tensor of auxiliary_stage with shape (n, 3, 4*h, 4*w) + aux_feats['outs'].append(hr) + + aux_feat = self.aux_block_down1(paddle.concat([hr, hr_high], + axis=1)) + aux_feat = self.aux_block_down2( + paddle.concat([aux_feat, hr_mid], axis=1)) + aux_feat = self.aux_fusion(paddle.concat([aux_feat, hr_low], + axis=1)) + + # out feature of auxiliary_stage with shape (n, c, h, w) + aux_feats['feats'].append(aux_feat) + + return aux_feats + + def upsample(self, lqs, feats, aux_feats=None): + """Compute the output image given the features. + Args: + lqs (tensor): Input LR images with shape (n, t, c, h, w). + feats (dict): Dict of features after stage3. + aux_feats (dict): Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately. + + Returns: + Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w). 
+ """ + + outputs = [] + outputs_head = [] + num_outputs = len(feats['spatial']) + + mapping_idx = list(range(0, num_outputs)) + mapping_idx += mapping_idx[::-1] + + cas_outs = [] + pas = [] + hrs = [] + for i in range(0, lqs.shape[1]): + hr = [ + feats[k].pop(0) for k in feats + if (k != 'spatial' and k != 'feat_stage1') + ] + if 'feat_stage1' in feats: + local_feat = feats['feat_stage1'].pop(0) + hr.insert(0, local_feat) + hr.insert(0, feats['spatial'][mapping_idx[i]]) + hr = paddle.concat(hr, axis=1) + + hr = self.reconstruction(hr) + + hr = self.lrelu(self.upsample1(hr)) + hr = self.lrelu(self.upsample2(hr)) + if self.only_last: + hr = self.conv_last(hr) + else: + hr = self.lrelu(self.conv_hr(hr)) + hr = self.conv_last(hr) + + hr += self.img_upsample(lqs[:, i, :, :, :]) + if self.use_local_connnect: + local_head = self.lrelu(self.aux_upsample1(local_feat)) + local_head = self.lrelu(self.aux_upsample2(local_head)) + hr = self.hybrid_conv_last(local_head) + hr + + outputs.append(hr) + + if self.auxiliary_loss: + return paddle.stack(aux_feats['outs'], + axis=1), paddle.stack(outputs, axis=1) + return paddle.stack(outputs, axis=1) + + def forward(self, lqs): + """Forward function for PP-MSVSR. + Args: + lqs (Tensor): Input LR sequence with shape (n, t, c, h, w). + Returns: + Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w). + """ + + n, t, c, h, w = lqs.shape + + lqs_downsample = lqs + + # check whether the input is an extended sequence + self.check_if_mirror_extended(lqs) + + feats = {} + feats_ = self.feat_extract(lqs.reshape([-1, c, h, w])) + + h, w = feats_.shape[2:] + feats_ = feats_.reshape([n, t, -1, h, w]) + feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)] + + # compute optical flow using the low-res inputs + assert lqs_downsample.shape[3] >= 64 and lqs_downsample.shape[4] >= 64, ( + 'The height and width of low-res inputs must be at least 64, ' + f'but got {h} and {w}.') + + flows_forward, flows_backward = self.compute_flow(lqs_downsample) + + # feature propgation + feats['feat_stage1'] = [] + feats = self.stage1(feats, flows_backward, flows_forward) + + feats, pre_offset, pre_mask = self.stage2( + feats, (flows_backward, flows_forward)) + + if self.auxiliary_loss: + aux_feats = self.auxiliary_stage(feats, lqs) + + feats = self.stage3(feats, (flows_backward, flows_forward), aux_feats, + pre_offset, pre_mask) + + return self.upsample(lqs, feats, aux_feats=aux_feats) + + +class AlignmentModule(nn.Layer): + """deformable alignment module. + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + deformable_groups (int): Number of deformable_groups in DeformConv2D. 
+ """ + def __init__(self, + in_channels=128, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + deformable_groups=16): + super(AlignmentModule, self).__init__() + + self.conv_offset = nn.Sequential( + nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1), + ) + self.dcn = DeformConv2D(in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + deformable_groups=deformable_groups) + + self.init_offset() + + def init_offset(self): + constant_(self.conv_offset[-1].weight, 0) + constant_(self.conv_offset[-1].bias, 0) + + def forward(self, x, extra_feat, flow_1): + extra_feat = paddle.concat([extra_feat, flow_1], axis=1) + out = self.conv_offset(extra_feat) + o1, o2, mask = paddle.chunk(out, 3, axis=1) + + # offset + offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1)) + offset = offset + flow_1.flip(1).tile([1, offset.shape[1] // 2, 1, 1]) + + # mask + mask = F.sigmoid(mask) + out = self.dcn(x, offset, mask) + return out, offset, mask + + +class ReAlignmentModule(nn.Layer): + """refine deformable alignment module. + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + deformable_groups (int): Number of deformable_groups in DeformConv2D. 
+ """ + def __init__(self, + in_channels=128, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + deformable_groups=16): + super(ReAlignmentModule, self).__init__() + + self.mdconv = DeformConv2D(in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + deformable_groups=deformable_groups) + self.conv_offset = nn.Sequential( + nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1), + nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1), + ) + self.dcn = DeformConv2D(in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + deformable_groups=deformable_groups) + + self.init_offset() + + def init_offset(self): + constant_(self.conv_offset[-1].weight, 0) + constant_(self.conv_offset[-1].bias, 0) + + def forward(self, + x, + extra_feat, + flow_1, + feat_current, + pre_stage_flow=None, + pre_stage_mask=None): + if pre_stage_flow is not None: + pre_feat = self.mdconv(x, pre_stage_flow, pre_stage_mask) + extra_feat = paddle.concat([pre_feat, feat_current, flow_1], axis=1) + else: + extra_feat = paddle.concat([extra_feat, flow_1], axis=1) + out = self.conv_offset(extra_feat) + o1, o2, mask = paddle.chunk(out, 3, axis=1) + + # offset + offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1)) + if pre_stage_flow is not None: + offset = offset + pre_stage_flow + else: + offset = offset + flow_1.flip(1).tile( + [1, offset.shape[1] // 2, 1, 1]) + + # mask + if pre_stage_mask is not None: + mask = (F.sigmoid(mask) + pre_stage_mask) / 2.0 + else: + mask = F.sigmoid(mask) + out = self.dcn(x, offset, mask) + return out + + +class ModifiedSPyNet(nn.Layer): + """Modified SPyNet network structure. + + The difference to the SPyNet in paper is that + 1. convolution with kernel_size=7 is replaced by convolution with kernel_size=3 in this version, + 2. less SPyNetBasicModule is used in this version, + 3. no BN is used in this version. + + Paper: + Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + + Args: + act_cfg (dict): Activation function. + Default: dict(name='LeakyReLU'). + num_blocks (int): Number of SPyNetBlock. + Default: 6. + use_tiny_block (bool): Whether use tiny spynet. + Default: True. + """ + def __init__(self, + act_cfg=dict(name='LeakyReLU'), + num_blocks=6, + use_tiny_block=False): + super().__init__() + self.num_blocks = num_blocks + self.basic_module = nn.LayerList([ + SPyNetBlock(act_cfg=act_cfg, use_tiny_block=use_tiny_block) + for _ in range(num_blocks) + ]) + + self.register_buffer( + 'mean', + paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1])) + self.register_buffer( + 'std', + paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1])) + + def compute_flow(self, ref, supp): + """Compute flow from ref to supp. + + Note that in this function, the images are already resized to a + multiple of 32. + + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). 
+ """ + n, _, h, w = ref.shape + + # normalize the input images + ref = [(ref - self.mean) / self.std] + supp = [(supp - self.mean) / self.std] + + # generate downsampled frames + for level in range(self.num_blocks - 1): + ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2)) + supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2)) + ref = ref[::-1] + supp = supp[::-1] + + # flow computation + flow = paddle.to_tensor( + np.zeros([ + n, 2, h // (2**(self.num_blocks - 1)), w // + (2**(self.num_blocks - 1)) + ], 'float32')) + + for level in range(len(ref)): + if level == 0: + flow_up = flow + else: + flow_up = F.interpolate( + flow, scale_factor=2, mode='bilinear', + align_corners=True) * 2.0 + + # add the residue to the upsampled flow + flow = flow_up + self.basic_module[level](paddle.concat([ + ref[level], + flow_warp(supp[level], + flow_up.transpose([0, 2, 3, 1]), + padding_mode='border'), flow_up + ], + axis=1)) + + return flow + + def compute_flow_list(self, ref, supp): + n, _, h, w = ref.shape + + # normalize the input images + ref = [(ref - self.mean) / self.std] + supp = [(supp - self.mean) / self.std] + + # generate downsampled frames + for level in range(self.num_blocks - 1): + ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2)) + supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2)) + ref = ref[::-1] + supp = supp[::-1] + + # flow computation + flow_list = [] + flow = paddle.to_tensor( + np.zeros([ + n, 2, h // (2**(self.num_blocks - 1)), w // + (2**(self.num_blocks - 1)) + ], 'float32')) + for level in range(len(ref)): + if level == 0: + flow_up = flow + else: + flow_up = F.interpolate( + flow, scale_factor=2, mode='bilinear', + align_corners=True) * 2.0 + + # add the residue to the upsampled flow + flow = flow_up + self.basic_module[level](paddle.concat([ + ref[level], + flow_warp(supp[level], + flow_up.transpose([0, 2, 3, 1]), + padding_mode='border'), flow_up + ], + axis=1)) + flow_list.append(flow) + return flow_list + + def forward(self, ref, supp): + """Forward function of Modified SPyNet. + + This function computes the optical flow from ref to supp. + + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). + """ + + # upsize to a multiple of 32 + h, w = ref.shape[2:4] + w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1) + h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1) + ref = F.interpolate(ref, + size=(h_up, w_up), + mode='bilinear', + align_corners=False) + + supp = F.interpolate(supp, + size=(h_up, w_up), + mode='bilinear', + align_corners=False) + + ref.stop_gradient = False + supp.stop_gradient = False + + # compute flow, and resize back to the original resolution + flow = F.interpolate(self.compute_flow(ref, supp), + size=(h, w), + mode='bilinear', + align_corners=False) + + # adjust the flow values + flow[:, 0, :, :] *= float(w) / float(w_up) + flow[:, 1, :, :] *= float(h) / float(h_up) + + return flow + + +class SPyNetBlock(nn.Layer): + """Basic Block of Modified SPyNet. 
+ refer to Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + """ + def __init__(self, act_cfg=dict(name='LeakyReLU'), use_tiny_block=False): + super().__init__() + if use_tiny_block: + self.basic_module = nn.Sequential( + ConvLayer(in_channels=8, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=8, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=8, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + act_cfg=None)) + else: + self.basic_module = nn.Sequential( + ConvLayer(in_channels=8, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=64, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + act_cfg=act_cfg), + ConvLayer(in_channels=16, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + act_cfg=None)) + + def forward(self, tensor_input): + """Forward function of SPyNetBlock. + Args: + tensor_input (Tensor): Input tensor with shape (b, 8, h, w). + 8 channels contain: + [reference image (3), neighbor image (3), initial flow (2)]. 
+
+        Returns:
+            Tensor: Refined flow with shape (b, 2, h, w).
+        """
+        return self.basic_module(tensor_input)
+
+
+class ConvLayer(nn.Layer):
+    """Conv2D followed by an optional activation.
+
+    Args:
+        act_cfg (dict|None): Activation config, e.g. dict(name='ReLU') or
+            dict(name='LeakyReLU'). None disables the activation.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 act_cfg=dict(name='ReLU')):
+        super(ConvLayer, self).__init__()
+        self.act_cfg = act_cfg
+        self.with_activation = act_cfg is not None
+
+        self.conv = nn.Conv2D(in_channels=in_channels,
+                              out_channels=out_channels,
+                              kernel_size=kernel_size,
+                              stride=stride,
+                              padding=padding,
+                              dilation=dilation,
+                              groups=groups)
+
+        if self.with_activation:
+            if act_cfg['name'] == 'ReLU':
+                self.act = paddle.nn.ReLU()
+            elif act_cfg['name'] == 'LeakyReLU':
+                self.act = nn.LeakyReLU(negative_slope=0.1)
+            else:
+                raise ValueError(
+                    f"Unsupported activation: {act_cfg['name']}")
+
+    def forward(self, tensor_input):
+        out = self.conv(tensor_input)
+        if self.with_activation:
+            out = self.act(out)
+        return out
diff --git a/ppgan/models/msvsr_model.py b/ppgan/models/msvsr_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ee6fbd3acf50983b46fa87f991a2c897875eefe
--- /dev/null
+++ b/ppgan/models/msvsr_model.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .sr_model import BaseSRModel
+from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
+from .generators.msvsr import ModifiedSPyNet
+from ..modules.init import reset_parameters
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class MultiStageVSRModel(BaseSRModel):
+    """PP-MSVSR Model.
+
+    Paper:
+        PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
+    """
+    def __init__(self, generator, fix_iter, pixel_criterion=None):
+        """Initialize the PP-MSVSR class.
+
+        Args:
+            generator (dict): config of generator.
+            fix_iter (int): number of initial iterations with SPyNet frozen.
+            pixel_criterion (dict): config of pixel criterion.
+ """ + super(MultiStageVSRModel, self).__init__(generator, pixel_criterion) + self.fix_iter = fix_iter + self.current_iter = 1 + self.flag = True + init_basicvsr_weight(self.nets['generator']) + if not self.fix_iter: + print('init train all parameters!!!') + for name, param in self.nets['generator'].named_parameters(): + param.trainable = True + if 'spynet' in name: + param.optimize_attr['learning_rate'] = 0.25 + + def setup_input(self, input): + self.lq = paddle.to_tensor(input['lq']) + self.visual_items['lq'] = self.lq[:, 0, :, :, :] + if 'gt' in input: + self.gt = paddle.to_tensor(input['gt']) + self.visual_items['gt'] = self.gt[:, 0, :, :, :] + self.image_paths = input['lq_path'] + + def train_iter(self, optims=None): + optims['optim'].clear_grad() + if self.fix_iter: + if self.current_iter == 1: + print('Train MSVSR with fixed spynet for', self.fix_iter, + 'iters.') + for name, param in self.nets['generator'].named_parameters(): + if 'spynet' in name: + param.trainable = False + elif self.current_iter >= self.fix_iter + 1 and self.flag: + print('Train all the parameters.') + for name, param in self.nets['generator'].named_parameters(): + param.trainable = True + if 'spynet' in name: + param.optimize_attr['learning_rate'] = 0.25 + self.flag = False + for net in self.nets.values(): + net.find_unused_parameters = False + + output = self.nets['generator'](self.lq) + if isinstance(output, (list, tuple)): + out_stage2, output = output + loss_pix_stage2 = self.pixel_criterion(out_stage2, self.gt) + self.losses['loss_pix_stage2'] = loss_pix_stage2 + self.visual_items['output'] = output[:, 0, :, :, :] + # pixel loss + loss_pix = self.pixel_criterion(output, self.gt) + self.losses['loss_pix'] = loss_pix + + self.loss = sum(_value for _key, _value in self.losses.items() + if 'loss_pix' in _key) + self.losses['loss'] = self.loss + + self.loss.backward() + optims['optim'].step() + + self.current_iter += 1 + + def test_iter(self, metrics=None): + self.gt = self.gt.cpu() + self.nets['generator'].eval() + with paddle.no_grad(): + output = self.nets['generator'](self.lq) + if isinstance(output, (list, tuple)): + out_stage1, output = output + self.nets['generator'].train() + + out_img = [] + gt_img = [] + + _, t, _, _, _ = self.gt.shape + for i in range(t): + out_tensor = output[0, i] + gt_tensor = self.gt[0, i] + out_img.append(tensor2img(out_tensor, (0., 1.))) + gt_img.append(tensor2img(gt_tensor, (0., 1.))) + + if metrics is not None: + for metric in metrics.values(): + metric.update(out_img, gt_img, is_seq=True) + + +def init_basicvsr_weight(net): + for m in net.children(): + if hasattr(m, + 'weight') and not isinstance(m, + (nn.BatchNorm, nn.BatchNorm2D)): + reset_parameters(m) + continue + + if (not isinstance( + m, + (ResidualBlockNoBN, PixelShufflePack, SPyNet, ModifiedSPyNet))): + init_basicvsr_weight(m)