Merge pull request #6 from qingqing01/dain

DAIN for video frame interpolation

Merge pull request #6 from qingqing01/dain
DAIN for video frame interpolation
38701e31 · qingqing01 · GitHub · c56dbd8f · e41aff14 · 38701e31
19 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,4 +38,4 @@
        entry: python ./.copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-        exclude: (?!.*third_party)^.*$
\ No newline at end of file
+        exclude: (?!.*third_party)^.*$
--- a/applications/DAIN/demo.py
+++ b/applications/DAIN/demo.py
+import os, sys
+import math
+import random
+import time
+import glob
+import shutil
+import numpy as np
+from imageio import imread, imsave
+import cv2
+
+import paddle.fluid as fluid
+
+import networks
+from util import *
+from my_args import args
+
+if __name__ == '__main__':
+    # Demo entry point: builds the DAIN_slowmotion network, loads its weights,
+    # then for every input video dumps the frames, interpolates between each
+    # pair of consecutive frames, merges original and interpolated frames and
+    # encodes the result at the increased frame rate.
+
+    DO_MiddleBurryOther = True
+
+    video_path = args.video_path
+    output_path = args.output_path
+    frame_path_input = os.path.join(output_path, 'frames-input')
+    frame_path_interpolated = os.path.join(output_path, 'frames-interpolated')
+    frame_path_combined = os.path.join(output_path, 'frames-combined')
+    video_path_input = os.path.join(output_path, 'videos-input')
+    video_path_output = os.path.join(output_path, 'videos-output')
+
+    # Create the output root and every working sub-directory up front.
+    for work_dir in (output_path, frame_path_input, frame_path_interpolated,
+                     frame_path_combined, video_path_input,
+                     video_path_output):
+        if not os.path.exists(work_dir):
+            os.makedirs(work_dir)
+
+    # A frame pair whose grayscale correlation falls below this threshold is
+    # treated as a scene cut (see the frame change test below).
+    args.KEY_FRAME_THREAD = 0.
+    saved_model = args.saved_model
+
+    timestep = args.time_step
+    # Number of frames synthesised between two consecutive input frames.
+    num_frames = int(1.0 / timestep) - 1
+
+    image = fluid.data(name='image',
+                       shape=[2, 1, args.channels, -1, -1],
+                       dtype='float32')
+    DAIN = networks.__dict__["DAIN_slowmotion"](channel=args.channels,
+                                                filter_size=args.filter_size,
+                                                timestep=args.time_step,
+                                                training=False)
+    out = DAIN(image)
+    # In inference mode the network returns ([coarse, rectified], offsets);
+    # keep the rectified frames.
+    out = out[0][1]
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    fetch_list = [out.name]
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+    # load_persistables() returns None; rebinding inference_program to its
+    # result would silently discard the test-mode clone created above.
+    fluid.io.load_persistables(exe, saved_model, inference_program)
+
+    if not DO_MiddleBurryOther:
+        sys.exit()
+
+    if video_path.endswith('.mp4'):
+        videos = [video_path]
+    else:
+        videos = sorted(glob.glob(os.path.join(video_path, '*.mp4')))
+    for vid in videos:
+        print("Interpolating video:", vid)
+        cap = cv2.VideoCapture(vid)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        # The capture is only needed to read the frame rate; release it so
+        # the file handle is not held for the whole interpolation run.
+        cap.release()
+        print("Old fps (frame rate): ", fps)
+
+        times_interp = int(1.0 / timestep)
+        r2 = str(int(fps) * times_interp)
+
+        print("New fps (frame rate): ", r2)
+
+        # Optional arguments of dump_frames_ffmpeg, all left unset here.
+        # NOTE(review): presumably frame rate / start time / duration for
+        # ffmpeg -- confirm against util.dump_frames_ffmpeg.
+        r = None
+        ss = None
+        t = None
+
+        out_path = dump_frames_ffmpeg(vid, frame_path_input, r, ss, t)
+
+        vidname = vid.split('/')[-1].split('.')[0]
+
+        tot_timer = AverageMeter()
+        proc_timer = AverageMeter()
+        end = time.time()
+
+        frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
+        if not frames:
+            # Nothing was dumped for this video; skip it instead of failing
+            # with an IndexError on frames[0].
+            print("No frames found in {}, skipping video: {}".format(
+                out_path, vid))
+            continue
+
+        img = imread(frames[0])
+
+        int_width = img.shape[1]
+        int_height = img.shape[0]
+        channel = img.shape[2]
+        if not channel == 3:
+            continue
+
+        # (x >> 7) << 7 rounds x down to a multiple of 128. Sizes that are not
+        # a multiple of 128 are padded up to the next one, split between both
+        # sides; sizes that already are get a fixed 32-pixel border.
+        if int_width != ((int_width >> 7) << 7):
+            int_width_pad = (((int_width >> 7) + 1) << 7)  # more than necessary
+            padding_left = int((int_width_pad - int_width) / 2)
+            padding_right = int_width_pad - int_width - padding_left
+        else:
+            int_width_pad = int_width
+            padding_left = 32
+            padding_right = 32
+
+        if int_height != ((int_height >> 7) << 7):
+            int_height_pad = (
+                ((int_height >> 7) + 1) << 7)  # more than necessary
+            padding_top = int((int_height_pad - int_height) / 2)
+            padding_bottom = int_height_pad - int_height - padding_top
+        else:
+            int_height_pad = int_height
+            padding_top = 32
+            padding_bottom = 32
+
+        # Window that crops the padding off the network output again.
+        rows = slice(padding_top, padding_top + int_height)
+        cols = slice(padding_left, padding_left + int_width)
+
+        frame_num = len(frames)
+        print(os.path.join(frame_path_input, vidname, '*.png'))
+        print('processing {} frames, from video: {}'.format(frame_num, vid))
+
+        for work_dir in (os.path.join(frame_path_interpolated, vidname),
+                         os.path.join(frame_path_combined, vidname)):
+            if not os.path.exists(work_dir):
+                os.makedirs(work_dir)
+
+        for i in range(frame_num - 1):
+            print(frames[i])
+            first = frames[i]
+            second = frames[i + 1]
+
+            img_first = imread(first)
+            img_second = imread(second)
+
+            # ---- Frame change test: correlate the two grayscale frames ----
+            img_first_gray = np.dot(img_first[..., :3], [0.299, 0.587, 0.114])
+            img_second_gray = np.dot(img_second[..., :3], [0.299, 0.587, 0.114])
+
+            img_first_gray = img_first_gray.flatten(order='C')
+            img_second_gray = img_second_gray.flatten(order='C')
+            corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1]
+            key_frame = corr < args.KEY_FRAME_THREAD
+            # ----------------------------------------------------------------
+
+            # HWC uint8 image -> CHW float32 in [0, 1].
+            X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255
+            X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255
+
+            if key_frame:
+                # Scene cut: repeat the first frame instead of interpolating.
+                # The comprehension must not reuse `i`; on Python 2 that would
+                # overwrite the frame index used in the output file names.
+                y_ = [
+                    np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0))
+                    for _ in range(num_frames)
+                ]
+            else:
+                assert (X0.shape[1] == X1.shape[1])
+                assert (X0.shape[2] == X1.shape[2])
+
+                print("size before padding ", X0.shape)
+                pad_width = ((0, 0), (padding_top, padding_bottom),
+                             (padding_left, padding_right))
+                X0 = np.pad(X0, pad_width, mode='edge')
+                X1 = np.pad(X1, pad_width, mode='edge')
+                print("size after padding ", X0.shape)
+
+                # (C, H, W) -> (1, 1, C, H, W); the two frames are then
+                # stacked on axis 0 to match the [2, 1, C, H, W] feed shape.
+                X0 = np.expand_dims(np.expand_dims(X0, axis=0), axis=0)
+                X1 = np.expand_dims(np.expand_dims(X1, axis=0), axis=0)
+                X = np.concatenate((X0, X1), axis=0)
+
+                proc_end = time.time()
+                o = exe.run(inference_program,
+                            fetch_list=fetch_list,
+                            feed={"image": X})
+                y_ = o[0]
+
+                proc_timer.update(time.time() - proc_end)
+                tot_timer.update(time.time() - end)
+                end = time.time()
+                print("*******current image process time \t " +
+                      str(time.time() - proc_end) + "s ******")
+
+                # Crop the padding and convert each frame back to HWC, 0..255.
+                y_ = [
+                    np.transpose(255.0 * item.clip(0, 1.0)[0, :, rows, cols],
+                                 (1, 2, 0)) for item in y_
+                ]
+
+            # Write the intermediate frames for BOTH branches. Previously this
+            # ran only in the interpolation branch, so key-frame pairs never
+            # produced their files.
+            for count, item in enumerate(y_, 1):
+                out_dir = os.path.join(frame_path_interpolated, vidname,
+                                       "{:0>4d}_{:0>4d}.png".format(i, count))
+                imsave(out_dir, np.round(item).astype(np.uint8))
+
+        input_dir = os.path.join(frame_path_input, vidname)
+        interpolated_dir = os.path.join(frame_path_interpolated, vidname)
+        combined_dir = os.path.join(frame_path_combined, vidname)
+        combine_frames(input_dir, interpolated_dir, combined_dir, num_frames)
+
+        frame_pattern_combined = os.path.join(frame_path_combined, vidname,
+                                              '%08d.png')
+        video_pattern_output = os.path.join(video_path_output, vidname + '.mp4')
+        # Remove a stale result so the encoder starts from a clean target.
+        if os.path.exists(video_pattern_output):
+            os.remove(video_pattern_output)
+        frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output, r2)
--- a/applications/DAIN/my_args.py
+++ b/applications/DAIN/my_args.py
+import os
+import datetime
+import argparse
+import numpy
+import networks
+
+modelnames = networks.__all__
+# import datasets
+# The trailing comma matters: without it this is a plain string, so argparse
+# `choices` would do substring matching and the help text would be joined
+# character by character.
+datasetNames = ('Vimeo_90K_interp', )  #datasets.__all__
+
+
+def _str2bool(value):
+    """Parse a command-line boolean such as 'true', 'False', '1' or '0'.
+
+    `type=bool` cannot be used for this: bool('False') is True because every
+    non-empty string is truthy.
+
+    Raises:
+        argparse.ArgumentTypeError: if `value` is not a recognised boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    text = str(value).strip().lower()
+    if text in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if text in ('false', 'f', 'no', 'n', '0', ''):
+        return False
+    raise argparse.ArgumentTypeError(
+        'expected a boolean value, got %r' % (value, ))
+
+
+parser = argparse.ArgumentParser(description='DAIN')
+
+parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+parser.add_argument('--netName',
+                    type=str,
+                    default='DAIN',
+                    choices=modelnames,
+                    help='model architecture: ' + ' | '.join(modelnames) +
+                    ' (default: DAIN)')
+
+parser.add_argument('--datasetName',
+                    default='Vimeo_90K_interp',
+                    choices=datasetNames,
+                    nargs='+',
+                    help='dataset type : ' + ' | '.join(datasetNames) +
+                    ' (default: Vimeo_90K_interp)')
+parser.add_argument('--video_path',
+                    default='',
+                    help='the path of selected videos')
+parser.add_argument('--output_path', default='', help='the output root path')
+
+parser.add_argument('--seed',
+                    type=int,
+                    default=1,
+                    help='random seed (default: 1)')
+
+parser.add_argument('--batch_size',
+                    '-b',
+                    type=int,
+                    default=1,
+                    help='batch size (default:1)')
+parser.add_argument('--channels',
+                    '-c',
+                    type=int,
+                    default=3,
+                    choices=[1, 3],
+                    help='channels of images (default:3)')
+parser.add_argument('--filter_size',
+                    '-f',
+                    type=int,
+                    default=4,
+                    help='the size of filters used (default: 4)',
+                    choices=[2, 4, 6, 5, 51])
+
+parser.add_argument('--time_step',
+                    type=float,
+                    default=0.5,
+                    help='choose the time steps')
+parser.add_argument(
+    '--alpha',
+    type=float,
+    nargs='+',
+    default=[0.0, 1.0],
+    help=
+    'the ration of loss for interpolated and rectified result (default: [0.0, 1.0])'
+)
+parser.add_argument('--frame_rate',
+                    type=int,
+                    default=None,
+                    help='frame rate of the input video')
+
+parser.add_argument('--patience',
+                    type=int,
+                    default=5,
+                    help='the patience of reduce on plateou')
+parser.add_argument('--factor',
+                    type=float,
+                    default=0.2,
+                    help='the factor of reduce on plateou')
+
+parser.add_argument('--saved_model',
+                    type=str,
+                    default='',
+                    help='path to the model weights')
+parser.add_argument('--no-date',
+                    action='store_true',
+                    help='don\'t append date timestamp to folder')
+parser.add_argument('--use_cuda',
+                    default=True,
+                    type=_str2bool,
+                    help='use cuda or not')
+parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not')
+
+# Parsed at import time: other modules do `from my_args import args`.
+args = parser.parse_args()
--- a/applications/DAIN/networks/__init__.py
+++ b/applications/DAIN/networks/__init__.py
+from .dain import DAIN
+from .dain_slowmotion import DAIN_slowmotion
+__all__ = ('DAIN', 'DAIN_slowmotion')
--- a/applications/DAIN/networks/dain.py
+++ b/applications/DAIN/networks/dain.py
+import paddle.fluid as fluid
+import resblock
+import pwcnet
+
+
+class DAIN(fluid.dygraph.Layer):
+    """Interpolates a single middle frame between two input frames.
+
+    A PWC-Net estimates the flow in both directions, both frames are warped
+    with it and averaged, and a residual network rectifies the average.
+    """
+
+    def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True):
+        """Build the flow and rectification sub-networks.
+
+        Parameters
+        ----------
+        channel: accepted for interface compatibility; not used here.
+        filter_size: stored on the instance; not used by forward().
+        timestep: temporal position of the generated frame; must be 0.5.
+        training: True to return losses, False to return predicted frames.
+        """
+        # base class initialization
+        super(DAIN, self).__init__()
+
+        self.filter_size = filter_size
+        self.training = training
+        self.timestep = timestep
+        assert timestep == 0.5, "DAIN only supports timestep == 0.5"
+        self.numFrames = int(1.0 / timestep) - 1
+
+        # forward() feeds the rectify net the concatenation of the averaged
+        # frame, the two warped frames and the two flow fields:
+        # 3 + 3 + 3 + 2 + 2 = 13 channels (assumes 3-channel frames and
+        # 2-channel flows, the same value DAIN_slowmotion uses). The previous
+        # expression evaluated to 437 and did not match that input.
+        inplanes = 13
+
+        self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes,
+                                                                    64)
+        self.flownets = pwcnet.__dict__['pwc_dc_net']()
+        self.div_flow = 20.0
+
+    def forward(self, input):
+        """Run the network on frames stacked along axis 0.
+
+        Parameters
+        ----------
+        input: training expects 3 stacked frames (first, target middle frame,
+            last); inference expects 2 (first, last).
+
+        Returns
+        -------
+        training: (losses, offsets) where losses holds the residuals of the
+            averaged and of the rectified frame against the target frame.
+        inference: ([averaged, rectified], [flow_0, flow_1]).
+        """
+        losses = []
+        offsets = []
+
+        # STEP 1: split the stacked input into individual frames.
+        if self.training:
+            assert input.shape[0] == 3
+            cur_input_0 = input[0]
+            cur_input_1 = input[1]
+            cur_input_2 = input[2]
+        else:
+            assert input.shape[0] == 2
+            cur_input_0 = input[0]
+            cur_input_2 = input[1]
+
+        # STEP 2: estimate the flow for both frame orders.
+        time_offsets = [
+            kk * self.timestep for kk in range(1, 1 + self.numFrames, 1)
+        ]
+
+        forward_pair = fluid.layers.concat([cur_input_0, cur_input_2], axis=1)
+        backward_pair = fluid.layers.concat([cur_input_2, cur_input_0], axis=1)
+        cur_offset_outputs = [
+            self.forward_flownets(self.flownets,
+                                  forward_pair,
+                                  time_offsets=time_offsets),
+            self.forward_flownets(self.flownets,
+                                  backward_pair,
+                                  time_offsets=time_offsets[::-1])
+        ]
+
+        # Only one intermediate frame exists, so take the first flow of each.
+        cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]]
+
+        # STEP 3: warp both frames with the warp op of PWC-Net and average.
+        ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0])
+        ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1])
+        cur_output = (ref0 + ref2) / 2.0
+
+        # STEP 4: predict a residual that rectifies the averaged frame.
+        rectify_input = fluid.layers.concat([
+            cur_output, ref0, ref2, cur_offset_output[0], cur_offset_output[1]
+        ],
+                                            axis=1)
+        cur_output_rectified = self.rectifyNet(rectify_input) + cur_output
+
+        # STEP 5: training returns the variables to be penalized, inference
+        # returns the predicted frames.
+        if self.training:
+            losses += [cur_output - cur_input_1]
+            losses += [cur_output_rectified - cur_input_1]
+            offsets += [cur_offset_output]
+            return losses, offsets
+
+        cur_outputs = [cur_output, cur_output_rectified]
+        return cur_outputs, cur_offset_output
+
+    def forward_flownets(self, model, input, time_offsets=None):
+        """Estimate the flow once and scale it for every time offset.
+
+        Parameters
+        ----------
+        model: flow network called as model(input).
+        input: the two frames concatenated along axis 1.
+        time_offsets: None (defaults to [0.5]), a single float, or a list.
+
+        Returns
+        -------
+        A list with one flow per time offset, each resized by a factor of 4
+        with bilinear interpolation.
+        """
+        if time_offsets is None:
+            time_offsets = [0.5]
+        elif isinstance(time_offsets, float):
+            time_offsets = [time_offsets]
+
+        # this is a single direction motion result, not a bidirectional one
+        flow = model(input)
+
+        # Scale the raw flow by div_flow and by each time offset.
+        scaled_flows = [
+            self.div_flow * flow * time_offset for time_offset in time_offsets
+        ]
+        # Bilinear resize; nearest interpolation is not expected to be better.
+        return [
+            fluid.layers.resize_bilinear(scaled, scale=4)
+            for scaled in scaled_flows
+        ]
--- a/applications/DAIN/networks/dain_slowmotion.py
+++ b/applications/DAIN/networks/dain_slowmotion.py
+import paddle.fluid as fluid
+import resblock
+import time
+import pwcnet
+
+
+class DAIN_slowmotion(fluid.dygraph.Layer):
+    """Interpolates several intermediate frames between two input frames.
+
+    The flow is estimated once per direction and scaled for every time
+    offset; each offset yields one warped-and-averaged frame and one
+    rectified frame, stacked along a new leading axis.
+    """
+
+    def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True):
+        """Build the flow and rectification sub-networks.
+
+        Parameters
+        ----------
+        channel: accepted for interface compatibility; not used here.
+        filter_size: stored on the instance; not used by forward().
+        timestep: temporal spacing of the generated frames;
+            int(1 / timestep) - 1 frames are produced per input pair.
+        training: True to return losses, False to return predicted frames.
+        """
+        # base class initialization
+        super(DAIN_slowmotion, self).__init__()
+
+        self.filter_size = filter_size
+        self.training = training
+        self.timestep = timestep
+        self.num_frames = int(1.0 / timestep) - 1
+
+        # Channels of the rectify net input built in forward():
+        # 3 + 3 + 3 + 2 + 2 (assumes 3-channel frames and 2-channel flows).
+        inplanes = 13
+
+        self.flownets = pwcnet.__dict__['pwc_dc_net']()
+        self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes,
+                                                                    64)
+        self.div_flow = 20.0
+
+    def forward(self, input):
+        """Run the network on frames stacked along axis 0.
+
+        Parameters
+        ----------
+        input: training expects 3 stacked frames (first, target frame,
+            last); inference expects 2 (first, last).
+
+        Returns
+        -------
+        training: (losses, offsets).
+        inference: ([averaged, rectified], last_flow_pair) where averaged and
+            rectified stack one frame per time offset along axis 0.
+
+        Raises
+        ------
+        ValueError: if the configured timestep yields no intermediate frame.
+        """
+        losses = []
+        offsets = []
+
+        # STEP 1: split the stacked input into individual frames.
+        if self.training:
+            assert input.shape[0] == 3
+            cur_input_0 = input[0]
+            cur_input_1 = input[1]
+            cur_input_2 = input[2]
+        else:
+            assert input.shape[0] == 2
+            cur_input_0 = input[0]
+            cur_input_2 = input[1]
+
+        # STEP 2: estimate the flow for both frame orders.
+        time_offsets = [
+            kk * self.timestep for kk in range(1, 1 + self.num_frames, 1)
+        ]
+        if not time_offsets:
+            # Without this check the loop below never runs and the outputs
+            # would be referenced before assignment.
+            raise ValueError(
+                "timestep {} yields no intermediate frames".format(
+                    self.timestep))
+
+        forward_pair = fluid.layers.concat([cur_input_0, cur_input_2], axis=1)
+        backward_pair = fluid.layers.concat([cur_input_2, cur_input_0], axis=1)
+        # The reverse direction uses the offsets in reverse order, so the
+        # k-th entries of both lists belong to the same intermediate frame.
+        cur_offset_outputs = [
+            self.forward_flownets(self.flownets,
+                                  forward_pair,
+                                  time_offsets=time_offsets),
+            self.forward_flownets(self.flownets,
+                                  backward_pair,
+                                  time_offsets=time_offsets[::-1])
+        ]
+
+        # STEP 3: interpolate one frame per time offset.
+        cur_output = None
+        cur_output_rectified = None
+        for temp_0, temp_1 in zip(cur_offset_outputs[0],
+                                  cur_offset_outputs[1]):
+            cur_offset_output = [temp_0, temp_1]
+
+            ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0])
+            ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1])
+            cur_output_temp = (ref0 + ref2) / 2.0
+
+            # Stack the averaged frames along a new leading axis.
+            stacked = fluid.layers.unsqueeze(cur_output_temp, axes=0)
+            if cur_output is None:
+                cur_output = stacked
+            else:
+                cur_output = fluid.layers.concat([cur_output, stacked],
+                                                 axis=0)
+
+            rectify_input = fluid.layers.concat([
+                cur_output_temp, ref0, ref2, cur_offset_output[0],
+                cur_offset_output[1]
+            ],
+                                                axis=1)
+
+            cur_output_rectified_temp = self.rectifyNet(
+                rectify_input) + cur_output_temp
+
+            # Stack the rectified frames the same way.
+            stacked = fluid.layers.unsqueeze(cur_output_rectified_temp,
+                                             axes=0)
+            if cur_output_rectified is None:
+                cur_output_rectified = stacked
+            else:
+                cur_output_rectified = fluid.layers.concat(
+                    [cur_output_rectified, stacked], axis=0)
+
+        # STEP 4: training returns the variables to be penalized, inference
+        # returns the predicted frames. cur_offset_output is the flow pair of
+        # the last time offset.
+        if self.training:
+            losses += [cur_output - cur_input_1]
+            losses += [cur_output_rectified - cur_input_1]
+            offsets += [cur_offset_output]
+            return losses, offsets
+
+        cur_outputs = [cur_output, cur_output_rectified]
+        return cur_outputs, cur_offset_output
+
+    def forward_flownets(self, model, input, time_offsets=None):
+        """Estimate the flow once and scale it for every time offset.
+
+        Parameters
+        ----------
+        model: flow network called as model(input).
+        input: the two frames concatenated along axis 1.
+        time_offsets: None (defaults to [0.5]), a single float, or a list.
+
+        Returns
+        -------
+        A list with one flow per time offset, each resized by a factor of 4
+        with bilinear interpolation.
+        """
+        if time_offsets is None:
+            time_offsets = [0.5]
+        elif isinstance(time_offsets, float):
+            time_offsets = [time_offsets]
+
+        # this is a single direction motion result, not a bidirectional one
+        flow = model(input)
+
+        # Scale the raw flow by div_flow and by each time offset.
+        scaled_flows = [
+            self.div_flow * flow * time_offset for time_offset in time_offsets
+        ]
+        # Bilinear resize; nearest interpolation is not expected to be better.
+        return [
+            fluid.layers.resize_bilinear(scaled, scale=4)
+            for scaled in scaled_flows
+        ]
--- a/applications/DAIN/predict.py
+++ b/applications/DAIN/predict.py
+import os, sys
+import math
+import random
+import time
+import glob
+import shutil
+import numpy as np
+from imageio import imread, imsave
+import cv2
+
+import paddle.fluid as fluid
+
+import networks
+from util import *
+from my_args import args
+
+
+def infer_engine(model_dir,
+                 run_mode='fluid',
+                 batch_size=1,
+                 use_gpu=False,
+                 min_subgraph_size=3):
+    """Create a Paddle predictor for the model stored in `model_dir`.
+
+    Args:
+        model_dir: directory holding the 'model' and 'params' files.
+        run_mode: 'fluid', or 'trt_fp32' / 'trt_fp16' to enable TensorRT.
+        batch_size: max batch size handed to the TensorRT engine.
+        use_gpu: run on GPU device 0 when True, otherwise on CPU.
+        min_subgraph_size: min subgraph size handed to the TensorRT engine.
+
+    Raises:
+        ValueError: if a non-'fluid' run mode is requested without GPU.
+    """
+    if run_mode != 'fluid' and not use_gpu:
+        raise ValueError(
+            "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}"
+            .format(run_mode, use_gpu))
+
+    # TensorRT run modes and the precision each one selects.
+    trt_precisions = {
+        'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
+        'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
+    }
+
+    model_file = os.path.join(model_dir, 'model')
+    params_file = os.path.join(model_dir, 'params')
+    config = fluid.core.AnalysisConfig(model_file, params_file)
+
+    if not use_gpu:
+        config.disable_gpu()
+    else:
+        # initial GPU memory(M), device ID
+        config.enable_use_gpu(100, 0)
+        # optimize graph and fuse op
+        config.switch_ir_optim(True)
+
+    if run_mode in trt_precisions:
+        config.enable_tensorrt_engine(workspace_size=1 << 10,
+                                      max_batch_size=batch_size,
+                                      min_subgraph_size=min_subgraph_size,
+                                      precision_mode=trt_precisions[run_mode],
+                                      use_static=False,
+                                      use_calib_mode=False)
+
+    # disable print log when predict
+    config.disable_glog_info()
+    # enable shared memory
+    config.enable_memory_optim()
+    # disable feed, fetch OP, needed by zero_copy_run
+    config.switch_use_feed_fetch_ops(False)
+    return fluid.core.create_paddle_predictor(config)
+
+
+def executor(model_dir, use_gpu=False):
+    """Load the inference model in `model_dir` and return what runs it.
+
+    Args:
+        model_dir: directory holding the 'model' and 'params' files.
+        use_gpu: place the executor on GPU device 0 when True, else on CPU.
+
+    Returns:
+        A tuple (exe, program, fetch_targets); the feed names reported by the
+        loader are dropped.
+    """
+    if use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    loaded = fluid.io.load_inference_model(dirname=model_dir,
+                                           executor=exe,
+                                           model_filename='model',
+                                           params_filename='params')
+    program, _, fetch_targets = loaded
+    return exe, program, fetch_targets
+
+
+class VideoFrameInterp(object):
+    def __init__(self,
+                 time_step,
+                 model_path,
+                 video_path,
+                 use_gpu=True,
+                 key_frame_thread=0.,
+                 output_path='output'):
+        """Store the run settings and load the inference model.
+
+        Args:
+            time_step: temporal spacing of the interpolated frames.
+            model_path: directory of the exported inference model.
+            video_path: a single .mp4 file or a directory of .mp4 files.
+            use_gpu: forwarded to executor() to choose the device.
+            key_frame_thread: correlation threshold below which a frame pair
+                is treated as a key frame.
+            output_path: root directory for all generated files.
+        """
+        self.time_step = time_step
+        self.model_path = model_path
+        self.video_path = video_path
+        self.key_frame_thread = key_frame_thread
+        self.output_path = output_path
+
+        # Executor, program and fetch targets come from the loaded model.
+        loaded = executor(model_path, use_gpu=use_gpu)
+        self.exe, self.program, self.fetch_targets = loaded
+        # The predictor-based path is not wired in; inference goes through
+        # the executor loaded above.
+
+    def run(self):
+        frame_path_input = os.path.join(self.output_path, 'frames-input')
+        frame_path_interpolated = os.path.join(self.output_path,
+                                               'frames-interpolated')
+        frame_path_combined = os.path.join(self.output_path, 'frames-combined')
+        video_path_output = os.path.join(self.output_path, 'videos-output')
+
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+        if not os.path.exists(frame_path_input):
+            os.makedirs(frame_path_input)
+        if not os.path.exists(frame_path_interpolated):
+            os.makedirs(frame_path_interpolated)
+        if not os.path.exists(frame_path_combined):
+            os.makedirs(frame_path_combined)
+        if not os.path.exists(video_path_output):
+            os.makedirs(video_path_output)
+
+        timestep = self.time_step
+        num_frames = int(1.0 / timestep) - 1
+
+        if self.video_path.endswith('.mp4'):
+            videos = [self.video_path]
+        else:
+            videos = sorted(glob.glob(os.path.join(self.video_path, '*.mp4')))
+
+        for cnt, vid in enumerate(videos):
+            print("Interpolating video:", vid)
+            cap = cv2.VideoCapture(vid)
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            print("Old fps (frame rate): ", fps)
+
+            times_interp = int(1.0 / timestep)
+            r2 = str(int(fps) * times_interp)
+            print("New fps (frame rate): ", r2)
+
+            out_path = dump_frames_ffmpeg(vid, frame_path_input)
+
+            vidname = vid.split('/')[-1].split('.')[0]
+
+            tot_timer = AverageMeter()
+            proc_timer = AverageMeter()
+            end = time.time()
+
+            frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
+
+            img = imread(frames[0])
+
+            int_width = img.shape[1]
+            int_height = img.shape[0]
+            channel = img.shape[2]
+            if not channel == 3:
+                continue
+
+            if int_width != ((int_width >> 7) << 7):
+                int_width_pad = (
+                    ((int_width >> 7) + 1) << 7)  # more than necessary
+                padding_left = int((int_width_pad - int_width) / 2)
+                padding_right = int_width_pad - int_width - padding_left
+            else:
+                int_width_pad = int_width
+                padding_left = 32
+                padding_right = 32
+
+            if int_height != ((int_height >> 7) << 7):
+                int_height_pad = (
+                    ((int_height >> 7) + 1) << 7)  # more than necessary
+                padding_top = int((int_height_pad - int_height) / 2)
+                padding_bottom = int_height_pad - int_height - padding_top
+            else:
+                int_height_pad = int_height
+                padding_top = 32
+                padding_bottom = 32
+
+            frame_num = len(frames)
+            print('processing {} frames, from video: {}'.format(frame_num, vid))
+
+            if not os.path.exists(os.path.join(frame_path_interpolated,
+                                               vidname)):
+                os.makedirs(os.path.join(frame_path_interpolated, vidname))
+            if not os.path.exists(os.path.join(frame_path_combined, vidname)):
+                os.makedirs(os.path.join(frame_path_combined, vidname))
+
+            for i in range(frame_num - 1):
+                print(frames[i])
+                first = frames[i]
+                second = frames[i + 1]
+
+                img_first = imread(first)
+                img_second = imread(second)
+                '''--------------Frame change test------------------------'''
+                img_first_gray = np.dot(img_first[..., :3],
+                                        [0.299, 0.587, 0.114])
+                img_second_gray = np.dot(img_second[..., :3],
+                                         [0.299, 0.587, 0.114])
+
+                img_first_gray = img_first_gray.flatten(order='C')
+                img_second_gray = img_second_gray.flatten(order='C')
+                corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1]
+                key_frame = False
+                if corr < self.key_frame_thread:
+                    key_frame = True
+                '''-------------------------------------------------------'''
+
+                X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255
+                X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255
+
+                if key_frame:
+                    y_ = [
+                        np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0))
+                        for i in range(num_frames)
+                    ]
+                else:
+                    assert (X0.shape[1] == X1.shape[1])
+                    assert (X0.shape[2] == X1.shape[2])
+
+                    print("size before padding ", X0.shape)
+                    X0 = np.pad(X0, ((0,0), (padding_top, padding_bottom), \
+                        (padding_left, padding_right)), mode='edge')
+                    X1 = np.pad(X1, ((0,0), (padding_top, padding_bottom), \
+                        (padding_left, padding_right)), mode='edge')
+                    print("size after padding ", X0.shape)
+
+                    X0 = np.expand_dims(X0, axis=0)
+                    X1 = np.expand_dims(X1, axis=0)
+
+                    X0 = np.expand_dims(X0, axis=0)
+                    X1 = np.expand_dims(X1, axis=0)
+
+                    X = np.concatenate((X0, X1), axis=0)
+
+                    proc_end = time.time()
+                    o = self.exe.run(self.program,
+                                     fetch_list=self.fetch_targets,
+                                     feed={"image": X})
+
+                    y_ = o[0]
+
+                    proc_timer.update(time.time() - proc_end)
+                    tot_timer.update(time.time() - end)
+                    end = time.time()
+                    print("*********** current image process time \t " +
+                          str(time.time() - proc_end) + "s *********")
+
+                    y_ = [
+                        np.transpose(
+                            255.0 * item.clip(
+                                0, 1.0)[0, :,
+                                        padding_top:padding_top + int_height,
+                                        padding_left:padding_left + int_width],
+                            (1, 2, 0)) for item in y_
+                    ]
+                    time_offsets = [
+                        kk * timestep for kk in range(1, 1 + num_frames, 1)
+                    ]
+
+                    count = 1
+                    for item, time_offset in zip(y_, time_offsets):
+                        out_dir = os.path.join(
+                            frame_path_interpolated, vidname,
+                            "{:0>4d}_{:0>4d}.png".format(i, count))
+                        count = count + 1
+                        imsave(out_dir, np.round(item).astype(np.uint8))
+
+            num_frames = int(1.0 / timestep) - 1
+
+            input_dir = os.path.join(frame_path_input, vidname)
+            interpolated_dir = os.path.join(frame_path_interpolated, vidname)
+            combined_dir = os.path.join(frame_path_combined, vidname)
+            combine_frames(input_dir, interpolated_dir, combined_dir,
+                           num_frames)
+
+            frame_pattern_combined = os.path.join(frame_path_combined, vidname,
+                                                  '%08d.png')
+            video_pattern_output = os.path.join(video_path_output,
+                                                vidname + '.mp4')
+            if os.path.exists(video_pattern_output):
+                os.remove(video_pattern_output)
+            frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output,
+                                   r2)
+
+
+if __name__ == '__main__':
+    predictor = VideoFrameInterp(args.time_step, args.saved_model,
+                                 args.video_path, args.output_path)
+    predictor.run()
--- a/applications/DAIN/pwcnet/__init__.py
+++ b/applications/DAIN/pwcnet/__init__.py
+from .pwcnet import *
--- a/applications/DAIN/pwcnet/correlation_op/README.md
+++ b/applications/DAIN/pwcnet/correlation_op/README.md
+自定义OP编译:
+2. 执行 `sh make.sh`，编译生成 correlation_lib.so 动态库
+3. 添加动态库路径到LD_LIBRARY_PATH：
+```
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`python3.7 -c 'import paddle; print(paddle.sysconfig.get_lib())'`
+```
+4. 添加correlation op的python路径:
+```
+export PYTHONPATH=$PYTHONPATH:`pwd`
+```
+5. 执行 `python test_correlation.py` 运行单测，验证自定义 OP 是否加载成功。
+
+PS: 如果paddle whl包是从官网上下载的，需要使用gcc 4.8，即把make.sh中的g++ 改为 g++-4.8
--- a/applications/DAIN/pwcnet/correlation_op/correlation.py
+++ b/applications/DAIN/pwcnet/correlation_op/correlation.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+import os
+file_dir = os.path.dirname(os.path.abspath(__file__))
+fluid.load_op_library(os.path.join(file_dir, 'correlation_lib.so'))
+
+from paddle.fluid.layer_helper import LayerHelper
+
+
+def correlation(input1,
+                input2,
+                pad_size,
+                kernel_size,
+                max_displacement,
+                stride1,
+                stride2,
+                corr_type_multiply=1):
+    helper = LayerHelper("correlation", **locals())
+    output = helper.create_variable_for_type_inference(dtype=input1.dtype)
+    helper.append_op(type="correlation",
+                     inputs={
+                         "Input1": input1,
+                         "Input2": input2
+                     },
+                     attrs={
+                         "pad_size": pad_size,
+                         "kernel_size": kernel_size,
+                         "max_displacement": max_displacement,
+                         "stride1": stride1,
+                         "stride2": stride2,
+                         "corr_type_multiply": corr_type_multiply
+                     },
+                     outputs={"Output": output})
+    return output
--- a/applications/DAIN/pwcnet/correlation_op/correlation_op.cc
+++ b/applications/DAIN/pwcnet/correlation_op/correlation_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Returns the 4-element output shape {batch, channels, height, width} of the
+// correlation op.
+//   channels = (2 * (max_displacement / stride2) + 1)^2
+//   height   = ceil((input_height + 2 * pad_size - 2 * border) / stride1)
+//   width    = ceil((input_width  + 2 * pad_size - 2 * border) / stride1)
+// where border = (kernel_size - 1) / 2 + max_displacement.
+inline std::vector<int64_t> CorrelationOutputSize(int batch, int input_height, int input_width, int stride1, int stride2, int kernel_size, int pad_size, int max_displacement) {
+  const int border = (kernel_size - 1) / 2 + max_displacement;
+  const int neighborhood = (max_displacement / stride2) * 2 + 1;
+  const float step = static_cast<float>(stride1);
+
+  // Extent of the padded input that remains once the border is removed on
+  // both sides, divided by the stride and rounded up.
+  const int out_height = std::ceil(static_cast<float>(input_height + 2 * pad_size - 2 * border) / step);
+  const int out_width = std::ceil(static_cast<float>(input_width + 2 * pad_size - 2 * border) / step);
+
+  std::vector<int64_t> shape;
+  shape.reserve(4);
+  shape.push_back(batch);
+  shape.push_back(neighborhood * neighborhood);
+  shape.push_back(out_height);
+  shape.push_back(out_width);
+  return shape;
+}
+
+// Declares the proto of the "correlation" operator: two inputs, one output
+// and the integer attributes read by InferShape and the CUDA kernels.
+class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override{
+    // Tensors.
+    AddInput("Input1", "input1");
+    AddInput("Input2", "input2");
+    AddOutput("Output", "output");
+    // Integer attributes; only corr_type_multiply has a default value.
+    AddAttr<int>("pad_size", "pad size for input1 and input2");
+    AddAttr<int>("kernel_size", "kernel size of input1 and input2");
+    AddAttr<int>("max_displacement", "max displacement of input1 and input2");
+    AddAttr<int>("stride1", "Input1 stride");
+    AddAttr<int>("stride2", "Input2 stride");
+    AddAttr<int>("corr_type_multiply", "correlation coefficient").SetDefault(1);
+    AddComment(R"DOC(Correlation of two feature map. Only support NCHW data format.)DOC");
+  }
+};
+
+// Forward "correlation" operator: validates inputs and attributes, infers
+// the output shape and selects the kernel data type.
+class CorrelationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that both inputs exist and are 4-D, that the strides are usable
+  // as divisors, and sets the dims of "Output" from CorrelationOutputSize().
+  void InferShape(framework::InferShapeContext* ctx) const override{
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(input1) cannot be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(input2) cannot be null");
+    int stride1 = ctx->Attrs().Get<int>("stride1");
+    int stride2 = ctx->Attrs().Get<int>("stride2");
+    int max_displacement = ctx->Attrs().Get<int>("max_displacement");
+    int pad_size = ctx->Attrs().Get<int>("pad_size");
+    int kernel_size = ctx->Attrs().Get<int>("kernel_size");
+
+    // Both strides are divisors in the shape computation and in the kernels;
+    // reject non-positive values before they cause a division by zero.
+    PADDLE_ENFORCE_GT(stride1, 0, "stride1 must be greater than 0");
+    PADDLE_ENFORCE_GT(stride2, 0, "stride2 must be greater than 0");
+
+    auto in_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    PADDLE_ENFORCE_EQ(in_dims.size() == 4, true, "input1 must be 4-dims");
+    PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true, "input2 must be 4-dims");
+    // The kernels index Input2 using Input1's N/C/H/W, so the shapes must
+    // match. Only enforced at run time, when every dimension is known.
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(in_dims == in2_dims, true, "Input1 and Input2 must have the same shape");
+    }
+    std::vector<int64_t> output_shape = CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1, stride2, kernel_size, pad_size, max_displacement);
+    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  // The kernel data type follows Input1; Input2 must have the same type.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override{
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
+    PADDLE_ENFORCE_EQ(input_data_type, ctx.Input<Tensor>("Input2")->type(), "Input1 and Input2 should have same type");
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+// Builds the "correlation_grad" op description from the forward op.
+// T is the op description type it is instantiated with (see the
+// REGISTER_OPERATOR call at the end of this file).
+template <typename T>
+class CorrelationOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("correlation_grad");
+    // The backward op re-reads both forward inputs and Output's gradient.
+    op->SetInput("Input1", this->Input("Input1"));
+    op->SetInput("Input2", this->Input("Input2"));
+    op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
+    // It produces one gradient per forward input.
+    op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
+    op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
+    // Forward attributes are passed through unchanged.
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+// Backward "correlation_grad" operator: each input gradient takes the shape
+// of the corresponding forward input.
+class CorrelationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Input1, Input2 and Output@GRAD, then sets the dims of
+  // Input1@GRAD and Input2@GRAD.
+  void InferShape(framework::InferShapeContext* ctx) const override{
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(Input1) should not be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(Input2) should not be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Output")), true, "Input(Output@GRAD) should not be null");
+
+    auto in1_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
+    // Input2's gradient must follow Input2's own dims (previously in1_dims
+    // was used here and in2_dims was left unused).
+    ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims);
+  }
+
+ protected:
+  // The kernel data type follows Input1; Output@GRAD must be present.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override{
+    const auto* var = ctx.InputVar(framework::GradVarName("Output"));
+    if (var == nullptr) {
+      PADDLE_THROW("cannot find Output@GRAD");
+    }
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
+  }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the forward op with its proto maker and a grad-op maker for each
+// of the two op description types (framework::OpDesc and imperative::OpBase).
+REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
+          ops::CorrelationOpGradMaker<paddle::framework::OpDesc>,
+          ops::CorrelationOpGradMaker<paddle::imperative::OpBase>);
+// The backward op has no proto maker or grad maker of its own.
+REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
--- a/applications/DAIN/pwcnet/correlation_op/correlation_op.cu
+++ b/applications/DAIN/pwcnet/correlation_op/correlation_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+#define THREADS_PER_BLOCK 32
+#define FULL_MASK 0xffffffff
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Warp-level sum using shuffle-down: at each step every lane adds the value
+// held by the lane `offset` positions above it, with offset = 16, 8, 4, 2, 1.
+// All 32 lanes named by FULL_MASK must take part in the call. After the last
+// step lane 0 holds the sum over the warp; other lanes hold partial sums.
+template <typename T>
+__forceinline__ __device__ T warpReduceSum(T val) {
+  int offset = 16;
+  while (offset > 0) {
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+    offset >>= 1;
+  }
+  return val;
+}
+
+// Block-level sum built from warpReduceSum(): each warp reduces its own
+// values, lane 0 of every warp stores the warp total in shared memory, and
+// the first warp then reduces those totals.
+// Only warp 0 runs the final reduction, so only its result is the block sum.
+// NOTE(review): there is no barrier after `shared` is read, so back-to-back
+// calls rely on the caller synchronising in between (correlation_forward
+// calls __syncthreads() before each call) - confirm for any new caller.
+template <typename T>
+__forceinline__ __device__ T blockReduceSum(T val) {
+  // One slot per warp.
+  static __shared__ T shared[32];
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = warpReduceSum(val);
+  if (lane == 0)
+    shared[wid] = val;
+
+  // Make every warp total visible before it is read back.
+  __syncthreads();
+  // Threads whose index is below the number of warps pick up one warp total
+  // each; all other threads contribute 0.
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+  if (wid == 0)
+    val = warpReduceSum(val);
+
+  return val;
+}
+
+// Writes zero to the first `num` elements of `x`. Each thread starts at its
+// global index and advances by the total number of launched threads, so any
+// 1-D launch configuration covers the whole range.
+template <typename T>
+__global__ void set_zero(T *x, int num) {
+  const unsigned int step = blockDim.x * gridDim.x;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  while (idx < num) {
+    x[idx] = static_cast<T>(0);
+    idx += step;
+  }
+}
+
+// Copies one spatial position of an N x C x H x W `input` into `rinput`,
+// which is addressed as N x (H + 2 * pad_size) x (W + 2 * pad_size) x C,
+// shifting the position by pad_size in both spatial directions.
+// The block index selects (n, h, w) through blockIdx.(x, y, z); the threads
+// of the block walk the channels in steps of THREADS_PER_BLOCK. Border
+// elements of `rinput` are not written by this kernel.
+template <typename T>
+__global__ void channel_first(const T *input, T *rinput, const int channel, const int height, const int width, const int pad_size) {
+  const int n = blockIdx.x;
+  const int h = blockIdx.y;
+  const int w = blockIdx.z;
+
+  const int padded_w = width + 2 * pad_size;
+  const int padded_h = height + 2 * pad_size;
+  const int plane = height * width;
+
+  // Offsets of this (n, h, w) position in each layout, without the channel
+  // term.
+  const int src_base = n * (channel * height * width) + h * width + w;
+  const int dst_base = n * (channel * padded_w * padded_h) + (h + pad_size) * (channel * padded_w) + (w + pad_size) * channel;
+
+  for (int c = threadIdx.x; c < channel; c += THREADS_PER_BLOCK) {
+    rinput[dst_base + c] = input[src_base + c * plane];
+  }
+}
+
+// Forward correlation kernel.
+// One block handles one output position: blockIdx.x is the batch item and
+// blockIdx.(y, z) the output (row, column). For each of the
+// displacement_size^2 displacements the threads of the block split the
+// channels between them, accumulate the products of rinput1 and the
+// displaced rinput2 over the kernel window, and reduce the partial sums.
+// rinput1 / rinput2 are addressed as N x padded H x padded W x C (see the
+// index computations below); `output` is addressed as N x C_out x H_out x W_out.
+template <typename T>
+__global__ void correlation_forward(T *output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int input_channel, const int input_height, const int input_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) {
+
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+
+  int displacement_size = 2 * displacement_rad + 1;
+
+  // Centre of the window in the padded first input for this output position.
+  int n = blockIdx.x;
+  int h1 = blockIdx.y * stride1 + max_displacement;
+  int w1 = blockIdx.z * stride1 + max_displacement;
+  int c = threadIdx.x;
+
+  // Strides of the padded, channel-last inputs.
+  int p_dimchw = p_input_height * p_input_width * input_channel;
+  int p_dimcw = p_input_width * input_channel;
+  int p_dimc = input_channel;
+
+  // Strides of the output.
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  // Number of products per output value; used to normalise the sum.
+  int nelems = kernel_size * kernel_size * p_dimc;
+
+  for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
+    for(int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
+      // Window centre in the second input for displacement (tj, ti).
+      int w2 = w1 + ti * stride2;
+      int h2 = h1 + tj * stride2;
+
+      T acc0 = 0;
+      for(int j = -kernel_rad; j <= kernel_rad; ++j) {
+        for(int i = -kernel_rad; i <= kernel_rad; ++i) {
+          // Each thread takes every blockDim.x-th channel.
+          for(int ch = c; ch < p_dimc; ch += blockDim.x) {
+            int index1 = n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch;
+            int index2 = n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch;
+            acc0 += static_cast<T>(rinput1[index1] * rinput2[index2]);
+          } 
+        }
+      }
+      // A single-warp block only needs the shuffle reduction; larger blocks
+      // go through shared memory.
+      if (blockDim.x == warpSize) {
+        __syncwarp();
+        acc0 = warpReduceSum(acc0);
+      } else {
+        __syncthreads();
+        acc0 = blockReduceSum(acc0);
+      }
+
+      // Thread 0 writes the normalised sum into the output channel that
+      // corresponds to this displacement.
+      if (threadIdx.x == 0) {
+        int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad);
+        const int t_index = n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z;
+        output[t_index] = static_cast<T>(acc0 / nelems);
+      }
+    }
+  }
+
+}
+
+//class CorrelationKernel<platform::CUDADeviceContext, T>
+// CUDA forward kernel of the "correlation" op.
+// Steps: allocate two temporaries shaped {N, H + 2*pad, W + 2*pad, C},
+// zero them and the output, copy both inputs into the temporaries with
+// channel_first, then launch correlation_forward with one block per
+// (batch item, output row, output column).
+template <typename T>
+class CorrelationKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must be CUDAPlace");
+
+    auto *input1 = ctx.Input<Tensor>("Input1");
+    auto *input2 = ctx.Input<Tensor>("Input2");
+    int pad_size = ctx.Attr<int>("pad_size");
+    int kernel_size = ctx.Attr<int>("kernel_size");
+    int stride1 = ctx.Attr<int>("stride1");
+    int stride2 = ctx.Attr<int>("stride2");
+    int max_displacement = ctx.Attr<int>("max_displacement");
+    // NOTE(review): read but never used below - the kernels always multiply.
+    int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
+
+    auto *output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>(ctx.GetPlace());
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // base on input1, NCHW
+    auto in_dims = input1->dims();
+    int N = in_dims[0];
+    int C = in_dims[1];
+    int H = in_dims[2];
+    int W = in_dims[3];
+
+    int padded_input_height = H + 2 * pad_size;
+    int padded_input_width = W + 2 * pad_size;
+
+    // Padded, channel-last copies of the two inputs.
+    Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput1.mutable_data<T>(ctx.GetPlace());
+
+    Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput2.mutable_data<T>(ctx.GetPlace());
+
+    // Zero the temporaries (channel_first does not write their border) and
+    // the output.
+    set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data<T>(), rinput1.numel());
+    set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data<T>(), rinput2.numel());
+    set_zero<<<(output->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(output->data<T>(), output->numel());
+
+    auto out_dims = output->dims();
+    int OC = out_dims[1];
+    int OH = out_dims[2];
+    int OW = out_dims[3];
+
+    // One block per (n, h, w) input position for the layout conversion.
+    dim3 blocks_grid(N, H, W);
+    dim3 threads_block(THREADS_PER_BLOCK);
+
+    // Input2 is copied using Input1's C/H/W.
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
+
+    // One block per (n, output row, output column) for the correlation.
+    dim3 threadsPerBlock(THREADS_PER_BLOCK);
+    dim3 totalBlocksCorr(N, OH, OW);
+
+    correlation_forward<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(output->data<T>(), OC, OH, OW, rinput1.data<T>(),
+C, H, W, rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
+  }
+};
+
+// Backward kernel for the gradient with respect to Input1, for batch item
+// `item`.
+// One block handles one (row, column, channel) element of the gradient:
+// blockIdx.(x, y) give the spatial position and blockIdx.z the channel.
+// The threads split the output channels (displacements) between them,
+// accumulate grad_output * rinput2 into a shared array, and thread 0 sums
+// the partial results and writes the normalised value.
+// Blocks that return early write nothing; the caller in this file zero-fills
+// grad_input1 before launching.
+// NOTE(review): the write index is blockIdx * stride1, while the caller
+// launches a grid of H x W x C blocks - looks in range only for stride1 == 1;
+// confirm.
+template <typename T>
+__global__ void correlation_backward_input1(int item, T *grad_input1, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) {
+
+  // Position in the padded input handled by this block.
+  int n = item;
+  int h = blockIdx.x * stride1 + pad_size;
+  int w = blockIdx.y * stride1 + pad_size;
+  int c = blockIdx.z;
+  int tch_off = threadIdx.x;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+  int displacement_size = 2 * displacement_rad + 1;
+
+  // Range of output positions whose kernel window covers (h, w).
+  int xmin = (w - kernel_rad - max_displacement) / stride1;
+  int ymin = (h - kernel_rad - max_displacement) / stride1;
+
+  int xmax = (w + kernel_rad - max_displacement) / stride1;
+  int ymax = (h + kernel_rad - max_displacement) / stride1;
+
+  // These tests depend only on the block index and the kernel arguments, so
+  // every thread of a block takes the same branch before __syncthreads().
+  if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
+    return;
+  }
+
+  if (xmin > xmax || ymin > ymax) {
+    return;
+  }
+
+  // Clamp the range to the output extent.
+  xmin = max(0, xmin);
+  xmax = min(output_width - 1, xmax);
+
+  ymin = max(0, ymin);
+  ymax = min(output_height - 1, ymax);
+
+  // Strides of the padded, channel-last input copy.
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+  int p_dimchw = input_channel * p_input_height * p_input_width;
+  int p_dimcw = input_channel * p_input_width;
+  int p_dimc = input_channel;
+
+  // Strides of grad_output.
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  // Strides of grad_input1.
+  int o_dimchw = input_channel * input_height * input_width;
+  int o_dimhw = input_height * input_width;
+  int o_dimw = input_width;
+
+  // Same normalisation factor as the forward pass.
+  int nelems = kernel_size * kernel_size * input_channel;
+
+  // One partial sum per thread.
+  __shared__ T prod_sum[THREADS_PER_BLOCK];
+  prod_sum[tch_off] = 0;
+
+  for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
+    // Displacement encoded by output channel tc.
+    int i2 = (tc % displacement_size - displacement_rad) * stride2;
+    int j2 = (tc / displacement_size - displacement_rad) * stride2;
+
+    int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c;
+
+    T val2 = rinput2[index2];
+    for (int j = ymin; j <= ymax; ++j) {
+      for (int i = xmin; i <= xmax; ++i) {
+        int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
+        prod_sum[tch_off] += grad_output[t_index] * val2;
+      }
+    }
+  }
+
+  // All partial sums must be written before thread 0 reads them.
+  __syncthreads();
+
+  if (tch_off == 0) {
+    T reduce_sum = 0;
+    for (int index = 0; index < THREADS_PER_BLOCK; index++) {
+      reduce_sum += prod_sum[index];
+    }
+    // Undo the padding offset to address the unpadded gradient.
+    const int index1 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
+    grad_input1[index1] = static_cast<T>(reduce_sum / nelems);
+  }
+
+}
+
+// Backward kernel for the gradient with respect to Input2, for batch item
+// `item`.
+// Same block/thread layout as correlation_backward_input1: blockIdx.(x, y)
+// give the spatial position, blockIdx.z the channel, and the threads split
+// the output channels (displacements). Here the range of contributing output
+// positions depends on the displacement, so it is recomputed per output
+// channel, and the matching rinput1 value is read at (h - j2, w - i2).
+// NOTE(review): as in correlation_backward_input1, the write index is
+// blockIdx * stride1 - looks in range only for stride1 == 1; confirm.
+template <typename T>
+__global__ void correlation_backward_input2(int item, T *grad_input2, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2){
+
+  // Position in the padded input handled by this block.
+  int n = item;
+  int h = blockIdx.x * stride1 + pad_size;
+  int w = blockIdx.y * stride1 + pad_size;
+  int c = blockIdx.z;
+
+  int tch_off = threadIdx.x;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+  int displacement_size = 2 * displacement_rad + 1;
+
+  // Strides of the padded, channel-last input copy.
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+  int p_dimchw = input_channel * p_input_height * p_input_width;
+  int p_dimcw = input_channel * p_input_width;
+  int p_dimc = input_channel;
+
+  // Strides of grad_output.
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  // Strides of grad_input2.
+  int o_dimchw = input_channel * input_height * input_width;
+  int o_dimhw = input_height * input_width;
+  int o_dimw = input_width;
+
+  // Same normalisation factor as the forward pass.
+  int nelems = kernel_size * kernel_size * input_channel;
+
+  // One partial sum per thread.
+  __shared__ T prod_sum[THREADS_PER_BLOCK];
+  prod_sum[tch_off] = 0;
+
+  for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
+    // Displacement encoded by output channel tc.
+    int i2 = (tc % displacement_size - displacement_rad) * stride2;
+    int j2 = (tc / displacement_size - displacement_rad) * stride2;
+
+    // Range of output positions that read (h, w) of the second input under
+    // this displacement.
+    int xmin = (w - kernel_rad - max_displacement - i2) / stride1;
+    int ymin = (h - kernel_rad - max_displacement - j2) / stride1;
+
+    int xmax = (w + kernel_rad - max_displacement - i2) / stride1;
+    int ymax = (h + kernel_rad - max_displacement - j2) / stride1;
+
+    // Skip displacements with no contributing output position.
+    if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
+      continue;
+    }
+
+    if (xmin > xmax || ymin > ymax) {
+      continue;
+    }
+
+    // Clamp the range to the output extent.
+    xmin = max(0, xmin);
+    xmax = min(output_width - 1, xmax);
+
+    ymin = max(0, ymin);
+    ymax = min(output_height - 1, ymax);
+
+    int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c;
+    T val1 = rinput1[index1];
+    for (int j = ymin; j <= ymax; ++j) {
+      for (int i = xmin; i <= xmax; ++i) {
+        int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
+        prod_sum[tch_off] += grad_output[t_index] * val1;
+      }
+    }
+  }
+
+  // All partial sums must be written before thread 0 reads them.
+  __syncthreads();
+
+  if (tch_off == 0) {
+    T reduce_sum = 0;
+    for (int index = 0; index < THREADS_PER_BLOCK; index++) {
+      reduce_sum += prod_sum[index];
+    }
+    // Undo the padding offset to address the unpadded gradient.
+    const int index2 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
+    grad_input2[index2] = static_cast<T>(reduce_sum / nelems);
+  }
+}
+
+// CUDA backward kernel of the "correlation" op.
+// Steps: rebuild the padded, channel-last copies of both forward inputs,
+// zero both gradients, then launch the two backward kernels once per batch
+// item with one block per (row, column, channel) of the input.
+template <typename T>
+class CorrelationGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must use CUDAPlace.");
+    const auto *input1 = ctx.Input<Tensor>("Input1");
+    const auto *input2 = ctx.Input<Tensor>("Input2");
+    const auto *grad_output = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    const int pad_size = ctx.Attr<int>("pad_size");
+    const int kernel_size = ctx.Attr<int>("kernel_size");
+    const int stride1 = ctx.Attr<int>("stride1");
+    const int stride2 = ctx.Attr<int>("stride2");
+    const int max_displacement = ctx.Attr<int>("max_displacement");
+    // NOTE(review): read but never used below.
+    const int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
+
+    auto *grad_input1 = ctx.Output<Tensor>(framework::GradVarName("Input1"));
+    grad_input1->mutable_data<T>(ctx.GetPlace());
+    auto *grad_input2 = ctx.Output<Tensor>(framework::GradVarName("Input2"));
+    grad_input2->mutable_data<T>(ctx.GetPlace());
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // Dimensions are taken from Input1 (NCHW).
+    auto in_dims = input1->dims();
+    int N = in_dims[0];
+    int C = in_dims[1];
+    int H = in_dims[2];
+    int W = in_dims[3];
+
+    int padded_input_height = H + 2 * pad_size;
+    int padded_input_width = W + 2 * pad_size;
+    
+    // Padded, channel-last copies of the two forward inputs.
+    Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput1.mutable_data<T>(ctx.GetPlace());
+
+    Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput2.mutable_data<T>(ctx.GetPlace());
+
+    // The backward kernels skip positions without contributions, so the
+    // gradients must start from zero; the temporaries need a zero border.
+    set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data<T>(), rinput1.numel());
+    set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data<T>(), rinput2.numel());
+    set_zero<<<(grad_input1->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input1->data<T>(), grad_input1->numel());
+    set_zero<<<(grad_input2->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input2->data<T>(), grad_input2->numel());
+
+    auto grad_out_dims = grad_output->dims();
+    int GOC = grad_out_dims[1];
+    int GOH = grad_out_dims[2];
+    int GOW = grad_out_dims[3];
+
+    // One block per (n, h, w) input position for the layout conversion.
+    dim3 blocks_grid(N, H, W);
+    dim3 threads_block(THREADS_PER_BLOCK);
+
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
+    
+    // One block per (row, column, channel); the batch item is passed as an
+    // argument, hence one launch per item.
+    dim3 threadsPerBlock(THREADS_PER_BLOCK);
+    dim3 totalBlocksCorr(H, W, C);
+
+    // Gradient of Input1 uses the second input's values ...
+    for (int n = 0; n < N; n++) {
+      correlation_backward_input1<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH, GOW, rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
+    }
+
+    // ... and the gradient of Input2 uses the first input's values.
+    for (int n = 0; n < N; n++) {
+      correlation_backward_input2<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH, GOW, rinput1.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
+    }
+  }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the CUDA kernels of the forward and backward ops for float and
+// double. No CPU kernel is registered in this file.
+REGISTER_OP_CUDA_KERNEL(
+    correlation, ops::CorrelationKernel<float>,
+    ops::CorrelationKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    correlation_grad, ops::CorrelationGradKernel<float>,
+    ops::CorrelationGradKernel<double>);
+
--- a/applications/DAIN/pwcnet/correlation_op/make.sh
+++ b/applications/DAIN/pwcnet/correlation_op/make.sh
+# source /ssd1/vis/liufanglong/.bashrc
+#export PATH=/home/work/cuda-9.0/bin:$PATH
+#export PATH=/home/work/cuda-9.0/bin:$PATH
+#export LD_LIBRARY_PATH="/home/work/cuda-9.0/lib64:$LD_LIBRARY_PATH"
+#export LD_LIBRARY_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/lib64:$LD_LIBRARY_PATH
+#export CPLUS_INCLUDE_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/include:/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/include:$CPLUS_INCLUDE_PATH
+#export LD_LIBRARY_PATH=/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/lib:$LD_LIBRARY_PATH
+
+# Build correlation_lib.so from correlation_op.cu and correlation_op.cc
+# against the headers and libraries of the installed paddle package.
+
+# Stop at the first failing command, so that a failed nvcc step is not
+# followed by a link against a stale or missing object file.
+set -e
+
+# Header and library directories reported by the paddle installation.
+include_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_include())' )
+lib_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_lib())' )
+
+echo "$include_dir"
+echo "$lib_dir"
+
+# Compile the CUDA sources to object files. ${OPS} is left unquoted on
+# purpose so that it is split into one word per op name.
+OPS='correlation_op'
+for op in ${OPS}
+do
+nvcc "${op}.cu" -c -o "${op}.cu.o" -ccbin cc -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO -DPADDLE_WITH_MKLDNN -Xcompiler -fPIC -std=c++11 -w --expt-relaxed-constexpr -O0 -g -DNVCC \
+    -I "${include_dir}/third_party/" \
+    -I "${include_dir}"
+done
+
+# Link the CUDA object and the C++ op definition into the shared library.
+# g++-4.8 correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
+# g++ ${OPS}.cu.o ${OPS}.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
+g++ correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
+  -I "${include_dir}/third_party/" \
+  -I "${include_dir}" \
+  -L "${lib_dir}" \
+  -L /usr/local/cuda/lib64/ -lpaddle_framework -lcudart
+
+# rm *.cu.o
--- a/applications/DAIN/pwcnet/correlation_op/test_correlation.py
+++ b/applications/DAIN/pwcnet/correlation_op/test_correlation.py
+import unittest
+from correlation import correlation
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+
+
+def corr(x_1,
+         x_2,
+         pad_size=4,
+         kernel_size=1,
+         max_displacement=4,
+         stride1=1,
+         stride2=1,
+         corr_multiply=1):
+    """NumPy reference implementation of the correlation op used by the tests.
+
+    For every spatial position (i, j) of ``x_1`` and every displacement
+    (k, l) in ``[-max_displacement, max_displacement]``, the mean of the
+    element-wise product between the ``kernel_size`` x ``kernel_size`` patch
+    of ``x_1`` at (i, j) and the patch of ``x_2`` at (i + k, j + l) is stored.
+    The mean runs over the patch and over all channels.
+
+    Args:
+        x_1: array indexed as [batch, channel, row, col].
+        x_2: array of the same layout as ``x_1``.
+        pad_size: number of zero rows/columns added on each spatial border.
+        kernel_size: side length of the compared patches.
+        max_displacement: largest row/column offset applied to ``x_2``.
+        stride1: accepted for signature parity with the op; not used here.
+        stride2: accepted for signature parity with the op; not used here.
+        corr_multiply: accepted for signature parity with the op; not used.
+
+    Returns:
+        float32 array of shape [batch, (2 * max_displacement + 1) ** 2, H, W],
+        where H and W are the spatial sizes of ``x_1``. The channel index of
+        displacement (k, l) is ``(l + d) + (2 * d + 1) * (k + d)``.
+    """
+    K = kernel_size
+    d = max_displacement
+    D = 2 * d + 1
+
+    # Zero-pad the two spatial axes only, then move channels last so that a
+    # [row, col] slice yields a whole K x K x C patch.
+    spatial_pad = ((0, 0), (0, 0), (pad_size, pad_size), (pad_size, pad_size))
+    rinput1 = np.transpose(np.pad(x_1, spatial_pad, mode='constant'),
+                           (0, 2, 3, 1))
+    rinput2 = np.transpose(np.pad(x_2, spatial_pad, mode='constant'),
+                           (0, 2, 3, 1))
+
+    B = int(rinput1.shape[0])
+    # Both output extents come from the first input; the original read the
+    # width from x_2, which is only correct when the two shapes agree.
+    H = int(x_1.shape[2])
+    W = int(x_1.shape[3])
+    output = np.zeros((B, D * D, H, W), dtype=np.float32)
+
+    for b in range(B):
+        for i in range(H):
+            for j in range(W):
+                x1_index = i + pad_size
+                y1_index = j + pad_size
+                # The patch of the first input does not depend on (k, l).
+                patch1 = rinput1[b, x1_index:x1_index + K,
+                                 y1_index:y1_index + K]
+                for k in range(-d, d + 1):
+                    for l in range(-d, d + 1):
+                        x2_index = x1_index + k
+                        y2_index = y1_index + l
+                        patch2 = rinput2[b, x2_index:x2_index + K,
+                                         y2_index:y2_index + K]
+                        output[b, l + d + D * (k + d), i,
+                               j] = np.mean(patch1 * patch2)
+
+    return output
+
+
+class TestCorrelationOp(unittest.TestCase):
+    """Static-graph check of the correlation op against the NumPy reference."""
+
+    def test_check_output(self):
+        """Run the op through an Executor on CUDAPlace(0) and compare to corr()."""
+        np.random.seed(13)
+        np.set_printoptions(threshold=np.inf)
+        # A single shape drives both the graph placeholders and the arrays
+        # that are fed; previously the placeholders were declared as
+        # (2, 10, 3, 3) while (2, 3, 4, 5) arrays were fed.
+        x_shape = (2, 3, 4, 5)
+        x_type = 'float32'
+        x1 = fluid.layers.data(name='x1',
+                               shape=x_shape,
+                               dtype=x_type,
+                               append_batch_size=False)
+        x2 = fluid.layers.data(name='x2',
+                               shape=x_shape,
+                               dtype=x_type,
+                               append_batch_size=False)
+
+        x1_np = np.random.randn(*x_shape).astype(x_type)
+        x2_np = np.random.randn(*x_shape).astype(x_type)
+        out_np = corr(x1_np,
+                      x2_np,
+                      pad_size=4,
+                      kernel_size=1,
+                      max_displacement=4,
+                      stride1=1,
+                      stride2=1)
+
+        out = correlation(x1,
+                          x2,
+                          pad_size=4,
+                          kernel_size=1,
+                          max_displacement=4,
+                          stride1=1,
+                          stride2=1)
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        res = exe.run(feed={'x1': x1_np, 'x2': x2_np}, fetch_list=[out.name])
+
+        # Same tolerances as np.allclose, but a failure reports the
+        # mismatching elements instead of a bare "False is not true".
+        np.testing.assert_allclose(res[0], out_np, rtol=1e-5, atol=1e-8)
+
+
+class Net(fluid.dygraph.Layer):
+    """Thin dygraph layer whose forward pass is a single correlation call."""
+
+    def __init__(self, name_scope):
+        super(Net, self).__init__(name_scope)
+
+    def forward(self, x1, x2):
+        """Return correlation(x1, x2) with the fixed settings used by the tests."""
+        corr_settings = dict(pad_size=4,
+                             kernel_size=1,
+                             max_displacement=4,
+                             stride1=1,
+                             stride2=1)
+        return correlation(x1, x2, **corr_settings)
+
+
+class TestCorrelationOpDyGraph(unittest.TestCase):
+    """Dygraph check of the correlation op against the NumPy reference."""
+
+    def test_check_output(self):
+        """Run the op eagerly on CUDAPlace(0) via Net and compare to corr()."""
+        np.random.seed(13)
+        np.set_printoptions(threshold=np.inf)
+        # The shape is now actually used to build the inputs; it used to be
+        # an unused (2, 10, 3, 3) tuple that did not match the real data.
+        x_shape = (2, 3, 4, 5)
+        x_type = 'float32'
+        place = fluid.CUDAPlace(0)
+        with fluid.dygraph.guard(place):
+            x1_np = np.random.randn(*x_shape).astype(x_type)
+            x2_np = np.random.randn(*x_shape).astype(x_type)
+            out_np = corr(x1_np,
+                          x2_np,
+                          pad_size=4,
+                          kernel_size=1,
+                          max_displacement=4,
+                          stride1=1,
+                          stride2=1)
+
+            x1 = to_variable(x1_np)
+            x2 = to_variable(x2_np)
+            corr_pd = Net('corr_pd')
+            y = corr_pd(x1, x2)
+            out = y.numpy()
+            # Same tolerances as np.allclose, with a readable failure report.
+            np.testing.assert_allclose(out, out_np, rtol=1e-5, atol=1e-8)
+
+
+# Run every TestCase in this file when it is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/applications/DAIN/pwcnet/pwcnet.py
+++ b/applications/DAIN/pwcnet/pwcnet.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Conv2D, Conv2DTranspose
+
+from .correlation_op.correlation import correlation
+
+__all__ = ['pwc_dc_net']
+
+
+class PWCDCNet(fluid.dygraph.Layer):
+    """Coarse-to-fine optical-flow network.
+
+    The model consists of
+      * a six-level convolutional feature pyramid applied with shared weights
+        to both input images,
+      * per-level flow estimators (levels 6 down to 2) that consume a
+        correlation volume and use densely concatenated 3x3 convolutions,
+      * a stack of dilated convolutions that refines the level-2 flow.
+
+    Args:
+        md: value passed to the correlation op as both ``pad_size`` and
+            ``max_displacement``; the estimators expect ``(2 * md + 1) ** 2``
+            correlation channels.
+    """
+
+    def __init__(self, md=4):
+        super(PWCDCNet, self).__init__()
+        self.md = md
+        self.param_attr = fluid.ParamAttr(
+            regularizer=fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=0.0004),
+            initializer=fluid.initializer.MSRAInitializer(uniform=True,
+                                                          fan_in=None,
+                                                          seed=0))
+
+        # NOTE: pwc_dc_net() assigns pickled weights to model.parameters()
+        # purely by position, so the creation order of the layers below must
+        # not change.
+
+        # Feature pyramid: the first convolution of each level has stride 2.
+        self.conv1a = self._conv(3, 16, stride=2)
+        self.conv1aa = self._conv(16, 16)
+        self.conv1b = self._conv(16, 16)
+        self.conv2a = self._conv(16, 32, stride=2)
+        self.conv2aa = self._conv(32, 32)
+        self.conv2b = self._conv(32, 32)
+        self.conv3a = self._conv(32, 64, stride=2)
+        self.conv3aa = self._conv(64, 64)
+        self.conv3b = self._conv(64, 64)
+        self.conv4a = self._conv(64, 96, stride=2)
+        self.conv4aa = self._conv(96, 96)
+        self.conv4b = self._conv(96, 96)
+        self.conv5a = self._conv(96, 128, stride=2)
+        self.conv5aa = self._conv(128, 128)
+        self.conv5b = self._conv(128, 128)
+        self.conv6aa = self._conv(128, 196, stride=2)
+        self.conv6a = self._conv(196, 196)
+        self.conv6b = self._conv(196, 196)
+
+        # Number of correlation channels.
+        nd = (2 * self.md + 1)**2
+        # Channels accumulated by the dense concatenations after 1..5 convs.
+        # Plain Python ints are used; the former ``.astype(np.int)`` fails on
+        # NumPy >= 1.24, where the ``np.int`` alias no longer exists.
+        dd = [int(d) for d in np.cumsum([128, 128, 96, 64, 32])]
+
+        # Level 6 estimator: input is the correlation volume only.
+        od = nd
+        self.conv6_0 = self._conv(od, 128)
+        self.conv6_1 = self._conv(od + dd[0], 128)
+        self.conv6_2 = self._conv(od + dd[1], 96)
+        self.conv6_3 = self._conv(od + dd[2], 64)
+        self.conv6_4 = self._conv(od + dd[3], 32)
+        self.predict_flow6 = self._conv(od + dd[4], 2)
+        self.deconv6 = self._deconv(2)
+        self.upfeat6 = self._deconv(od + dd[4])
+
+        # Levels 5..2: correlation + pyramid features of image 1
+        # + upsampled flow (2 channels) + upsampled features (2 channels).
+        od = nd + 128 + 4
+        self.conv5_0 = self._conv(od, 128)
+        self.conv5_1 = self._conv(od + dd[0], 128)
+        self.conv5_2 = self._conv(od + dd[1], 96)
+        self.conv5_3 = self._conv(od + dd[2], 64)
+        self.conv5_4 = self._conv(od + dd[3], 32)
+        self.predict_flow5 = self._conv(od + dd[4], 2)
+        self.deconv5 = self._deconv(2)
+        self.upfeat5 = self._deconv(od + dd[4])
+
+        od = nd + 96 + 4
+        self.conv4_0 = self._conv(od, 128)
+        self.conv4_1 = self._conv(od + dd[0], 128)
+        self.conv4_2 = self._conv(od + dd[1], 96)
+        self.conv4_3 = self._conv(od + dd[2], 64)
+        self.conv4_4 = self._conv(od + dd[3], 32)
+        self.predict_flow4 = self._conv(od + dd[4], 2)
+        self.deconv4 = self._deconv(2)
+        self.upfeat4 = self._deconv(od + dd[4])
+
+        od = nd + 64 + 4
+        self.conv3_0 = self._conv(od, 128)
+        self.conv3_1 = self._conv(od + dd[0], 128)
+        self.conv3_2 = self._conv(od + dd[1], 96)
+        self.conv3_3 = self._conv(od + dd[2], 64)
+        self.conv3_4 = self._conv(od + dd[3], 32)
+        self.predict_flow3 = self._conv(od + dd[4], 2)
+        self.deconv3 = self._deconv(2)
+        self.upfeat3 = self._deconv(od + dd[4])
+
+        # Level 2 is the finest level, so it has no upsampling layers.
+        od = nd + 32 + 4
+        self.conv2_0 = self._conv(od, 128)
+        self.conv2_1 = self._conv(od + dd[0], 128)
+        self.conv2_2 = self._conv(od + dd[1], 96)
+        self.conv2_3 = self._conv(od + dd[2], 64)
+        self.conv2_4 = self._conv(od + dd[3], 32)
+        self.predict_flow2 = self._conv(od + dd[4], 2)
+
+        # Refinement stack of dilated convolutions on the level-2 features.
+        self.dc_conv1 = self._conv(od + dd[4], 128, dilation=1)
+        self.dc_conv2 = self._conv(128, 128, dilation=2)
+        self.dc_conv3 = self._conv(128, 128, dilation=4)
+        self.dc_conv4 = self._conv(128, 96, dilation=8)
+        self.dc_conv5 = self._conv(96, 64, dilation=16)
+        self.dc_conv6 = self._conv(64, 32, dilation=1)
+        self.dc_conv7 = self._conv(32, 2)
+
+    def _conv(self, in_channels, out_channels, stride=1, dilation=1):
+        """Create a 3x3 Conv2D that uses ``self.param_attr``.
+
+        The padding is set equal to ``dilation``, which matches every
+        convolution of this network (padding 1 for the undilated ones).
+        """
+        return Conv2D(in_channels,
+                      out_channels,
+                      3,
+                      stride=stride,
+                      padding=dilation,
+                      dilation=dilation,
+                      param_attr=self.param_attr)
+
+    def _deconv(self, in_channels):
+        """Create the 4x4, stride-2, padding-1 transposed conv with 2 outputs."""
+        return Conv2DTranspose(in_channels,
+                               2,
+                               4,
+                               stride=2,
+                               padding=1,
+                               param_attr=self.param_attr)
+
+    @staticmethod
+    def _lrelu(x):
+        """Leaky ReLU with the slope 0.1 used throughout the network."""
+        return fluid.layers.leaky_relu(x, 0.1)
+
+    def _encode(self, x, *convs):
+        """Apply ``convs`` in sequence, each followed by the leaky ReLU."""
+        for conv in convs:
+            x = self._lrelu(conv(x))
+        return x
+
+    def _dense(self, x, *convs):
+        """Densely connected stack: each conv output is prepended to its input."""
+        for conv in convs:
+            x = fluid.layers.concat(input=[self._lrelu(conv(x)), x], axis=1)
+        return x
+
+    def _decoder_input(self, c1, c2, up_flow, up_feat, flow_scale):
+        """Assemble the estimator input of one pyramid level.
+
+        ``c2`` is warped with ``up_flow * flow_scale`` and correlated with
+        ``c1``; the activated correlation is concatenated with ``c1``,
+        ``up_flow`` and ``up_feat`` along the channel axis.
+        """
+        warped = self.warp(c2, up_flow * flow_scale)
+        cost = self._lrelu(self.corr(c1, warped))
+        return fluid.layers.concat(input=[cost, c1, up_flow, up_feat], axis=1)
+
+    def _sampling_grid(self, x, flo):
+        """Build the normalised sampling grid for ``grid_sampler``.
+
+        A pixel-coordinate mesh (x in channel 0, y in channel 1) of the
+        spatial size of ``x`` is shifted by ``flo`` and mapped to [-1, 1]
+        via ``2 * v / (size - 1) - 1``. The result is laid out as
+        [B, H, W, 2].
+        """
+        x_shape = fluid.layers.shape(x)
+        B, H, W = x_shape[0], x_shape[2], x_shape[3]
+        bb = fluid.layers.range(0, B, 1, 'float32')
+        xx = fluid.layers.range(0, W, 1, 'float32')
+        yy = fluid.layers.range(0, H, 1, 'float32')
+        # Including the batch range makes meshgrid return [B, H, W] tensors.
+        _, yy, xx = paddle.tensor.meshgrid(bb, yy, xx)
+        yy = fluid.layers.unsqueeze(yy, [1])
+        xx = fluid.layers.unsqueeze(xx, [1])
+        grid = fluid.layers.concat(input=[xx, yy], axis=1)
+        vgrid = fluid.layers.elementwise_add(grid, flo)
+
+        vgrid_0 = 2.0 * fluid.layers.slice(
+            vgrid, axes=[1], starts=[0], ends=[1]) / (W - 1.) - 1.0
+        vgrid_1 = 2.0 * fluid.layers.slice(
+            vgrid, axes=[1], starts=[1], ends=[2]) / (H - 1.) - 1.0
+
+        vgrid = fluid.layers.concat(input=[vgrid_0, vgrid_1], axis=1)
+        return fluid.layers.transpose(vgrid, [0, 2, 3, 1])
+
+    def warp(self, x, flo):
+        """
+        warp an image/tensor (im2) back to im1, according to the optical flow
+        x: [B, C, H, W] (im2)
+        flo: [B, 2, H, W] flow
+
+        Positions where an all-ones tensor sampled with the same grid falls
+        below 0.999 are set to zero in the result.
+        """
+        vgrid = self._sampling_grid(x, flo)
+        output = fluid.layers.grid_sampler(name='grid_sample', x=x, grid=vgrid)
+
+        mask = fluid.layers.zeros_like(x)
+        mask = mask + 1.0
+        mask = fluid.layers.grid_sampler(name='grid_sample', x=mask, grid=vgrid)
+        mask_temp1 = fluid.layers.cast(mask < 0.9990, 'float32')
+        mask = mask * (1 - mask_temp1)
+        mask = fluid.layers.cast(mask > 0, 'float32')
+        outwarp = fluid.layers.elementwise_mul(output, mask)
+
+        return outwarp
+
+    def warp_nomask(self, x, flo):
+        """
+        warp an image/tensor (im2) back to im1, according to the optical flow
+        x: [B, C, H, W] (im2)
+        flo: [B, 2, H, W] flow
+
+        Same sampling as warp(), but the result is returned unmasked.
+        """
+        vgrid = self._sampling_grid(x, flo)
+        output = fluid.layers.grid_sampler(name='grid_sample', x=x, grid=vgrid)
+
+        return output
+
+    def corr(self, x_1, x_2):
+        """Correlation volume of two feature maps with displacement ``self.md``."""
+        out = correlation(x_1,
+                          x_2,
+                          pad_size=self.md,
+                          kernel_size=1,
+                          max_displacement=self.md,
+                          stride1=1,
+                          stride2=1,
+                          corr_type_multiply=1)
+        return out
+
+    def forward(self, x, output_more=False):
+        """Estimate the flow between the two images stacked in ``x``.
+
+        Args:
+            x: tensor whose channels 0:3 hold the first image and channels
+                3:6 the second one.
+            output_more: when true, return the flow of every level.
+
+        Returns:
+            ``flow2`` (the refined level-2 flow), or the list
+            ``[flow2, flow3, flow4, flow5, flow6]`` when ``output_more`` is set.
+        """
+        im1 = fluid.layers.slice(x, axes=[1], starts=[0], ends=[3])
+        im2 = fluid.layers.slice(x, axes=[1], starts=[3], ends=[6])
+
+        # Shared-weight feature pyramid for both images.
+        c11 = self._encode(im1, self.conv1a, self.conv1aa, self.conv1b)
+        c21 = self._encode(im2, self.conv1a, self.conv1aa, self.conv1b)
+        c12 = self._encode(c11, self.conv2a, self.conv2aa, self.conv2b)
+        c22 = self._encode(c21, self.conv2a, self.conv2aa, self.conv2b)
+        c13 = self._encode(c12, self.conv3a, self.conv3aa, self.conv3b)
+        c23 = self._encode(c22, self.conv3a, self.conv3aa, self.conv3b)
+        c14 = self._encode(c13, self.conv4a, self.conv4aa, self.conv4b)
+        c24 = self._encode(c23, self.conv4a, self.conv4aa, self.conv4b)
+        c15 = self._encode(c14, self.conv5a, self.conv5aa, self.conv5b)
+        c25 = self._encode(c24, self.conv5a, self.conv5aa, self.conv5b)
+        c16 = self._encode(c15, self.conv6aa, self.conv6a, self.conv6b)
+        c26 = self._encode(c25, self.conv6aa, self.conv6a, self.conv6b)
+
+        # Level 6: no flow yet, so the features are correlated unwarped.
+        corr6 = self._lrelu(self.corr(c16, c26))
+        x = self._dense(corr6, self.conv6_0, self.conv6_1, self.conv6_2,
+                        self.conv6_3, self.conv6_4)
+        flow6 = self.predict_flow6(x)
+        up_flow6 = self.deconv6(flow6)
+        up_feat6 = self.upfeat6(x)
+
+        # NOTE(review): the warp scales 0.625 / 1.25 / 2.5 / 5.0 equal
+        # 20 / 2**level; presumably they convert the predicted flow to pixel
+        # units of the level -- confirm against the training setup.
+
+        # Level 5.
+        x = self._decoder_input(c15, c25, up_flow6, up_feat6, 0.625)
+        x = self._dense(x, self.conv5_0, self.conv5_1, self.conv5_2,
+                        self.conv5_3, self.conv5_4)
+        flow5 = self.predict_flow5(x)
+        up_flow5 = self.deconv5(flow5)
+        up_feat5 = self.upfeat5(x)
+
+        # Level 4.
+        x = self._decoder_input(c14, c24, up_flow5, up_feat5, 1.25)
+        x = self._dense(x, self.conv4_0, self.conv4_1, self.conv4_2,
+                        self.conv4_3, self.conv4_4)
+        flow4 = self.predict_flow4(x)
+        up_flow4 = self.deconv4(flow4)
+        up_feat4 = self.upfeat4(x)
+
+        # Level 3.
+        x = self._decoder_input(c13, c23, up_flow4, up_feat4, 2.5)
+        x = self._dense(x, self.conv3_0, self.conv3_1, self.conv3_2,
+                        self.conv3_3, self.conv3_4)
+        flow3 = self.predict_flow3(x)
+        up_flow3 = self.deconv3(flow3)
+        up_feat3 = self.upfeat3(x)
+
+        # Level 2.
+        x = self._decoder_input(c12, c22, up_flow3, up_feat3, 5.0)
+        x = self._dense(x, self.conv2_0, self.conv2_1, self.conv2_2,
+                        self.conv2_3, self.conv2_4)
+        flow2 = self.predict_flow2(x)
+
+        # Dilated-convolution refinement, added to the level-2 flow.
+        x = self._encode(x, self.dc_conv1, self.dc_conv2, self.dc_conv3,
+                         self.dc_conv4)
+        flow2 += self.dc_conv7(self._encode(x, self.dc_conv5, self.dc_conv6))
+
+        if not output_more:
+            return flow2
+        else:
+            return [flow2, flow3, flow4, flow5, flow6]
+
+
+def pwc_dc_net(path=None):
+    """Build a PWCDCNet and optionally load pickled weights into it.
+
+    Args:
+        path: path of a pickle file holding a dict of weights, or None to
+            keep the freshly initialised parameters. The dict's values are
+            matched to ``model.parameters()`` by position, not by key, so the
+            file must list the weights in the model's parameter order.
+
+    Returns:
+        The PWCDCNet instance.
+
+    Raises:
+        IndexError: if the file holds fewer entries than the model has
+            parameters.
+    """
+    model = PWCDCNet()
+    if path is not None:
+        import pickle
+        # NOTE(security): unpickling can execute arbitrary code; only load
+        # weight files from a trusted source.
+        # The context manager closes the file, which the bare open() did not.
+        with open(path, 'rb') as weight_file:
+            data = pickle.load(weight_file)
+        weight_list = list(data.values())
+        param_dict = {
+            param.name: weight_list[i]
+            for i, param in enumerate(model.parameters())
+        }
+        model.load_dict(param_dict)
+
+    return model
--- a/applications/DAIN/resblock/__init__.py
+++ b/applications/DAIN/resblock/__init__.py
+from .basicblock import *
--- a/applications/DAIN/resblock/basicblock.py
+++ b/applications/DAIN/resblock/basicblock.py
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Conv2D
+
+__all__ = ['MultipleBasicBlock', 'MultipleBasicBlock_4']
+
+
+def conv3x3(in_planes, out_planes, dilation=1, stride=1, param_attr=None):
+    """Return a bias-free 3x3 Conv2D with padding ``int(dilation * (3 - 1) / 2)``."""
+    kernel = 3
+    pad = int(dilation * (kernel - 1) / 2)
+    layer = Conv2D(in_planes,
+                   out_planes,
+                   filter_size=kernel,
+                   stride=stride,
+                   padding=pad,
+                   dilation=dilation,
+                   bias_attr=False,
+                   param_attr=param_attr)
+    return layer
+
+
+class BasicBlock(fluid.dygraph.Layer):
+    """Residual block of two 3x3 convolutions with no normalisation layers.
+
+    Args:
+        inplanes: input channels of the first convolution.
+        planes: output channels of both convolutions.
+        dilation: dilation of the first convolution only.
+        stride: stride of the first convolution only.
+        downsample: optional callable applied to the input to produce the
+            shortcut branch; the input itself is used when it is None.
+    """
+    expansion = 1
+
+    def __init__(self, inplanes, planes, dilation=1, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+
+        # Both convolutions share one normally initialised ParamAttr.
+        normal_init = fluid.initializer.NormalInitializer(loc=0.0,
+                                                          scale=1.0,
+                                                          seed=0)
+        weight_attr = fluid.ParamAttr(initializer=normal_init)
+
+        self.conv1 = conv3x3(inplanes, planes, dilation, stride, weight_attr)
+        self.conv2 = conv3x3(planes, planes, param_attr=weight_attr)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Compute relu(conv2(relu(conv1(x))) + shortcut(x))."""
+        out = fluid.layers.relu(self.conv1(x))
+        out = self.conv2(out)
+
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut
+        return fluid.layers.relu(out)
+
+
+class MultipleBasicBlock(fluid.dygraph.Layer):
+    """7x7 stem convolution, up to three ``block`` layers, and a 3-channel head.
+
+    Args:
+        input_feature: input channels of the stem convolution.
+        block: callable ``block(in_dim, out_dim, dilation=...)`` that builds
+            one intermediate layer.
+        num_blocks: block2 / block3 / block4 are created only when this is at
+            least 2 / 3 / 4 respectively.
+        intermediate_feature: channel width between stem and head.
+        dense: stored on the instance; not read anywhere in this class.
+    """
+
+    def __init__(self,
+                 input_feature,
+                 block,
+                 num_blocks,
+                 intermediate_feature=64,
+                 dense=True):
+        super(MultipleBasicBlock, self).__init__()
+        self.dense = dense
+        self.num_block = num_blocks
+        self.intermediate_feature = intermediate_feature
+
+        stem_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(loc=0.0,
+                                                            scale=1.0,
+                                                            seed=0))
+
+        self.block1 = Conv2D(input_feature,
+                             intermediate_feature,
+                             filter_size=7,
+                             stride=1,
+                             padding=3,
+                             bias_attr=True,
+                             param_attr=stem_attr)
+
+        dim = intermediate_feature
+
+        def optional_block(required_blocks):
+            # Build a block only when enough blocks were requested.
+            if num_blocks >= required_blocks:
+                return block(dim, dim, dilation=1)
+            return None
+
+        self.block2 = optional_block(2)
+        self.block3 = optional_block(3)
+        self.block4 = optional_block(4)
+        self.block5 = Conv2D(dim, 3, 3, 1, 1)
+
+    def forward(self, x):
+        """Run the stem (with ReLU), the enabled blocks, then the head."""
+        x = fluid.layers.relu(self.block1(x))
+        if self.num_block >= 2:
+            x = self.block2(x)
+        if self.num_block >= 3:
+            x = self.block3(x)
+        if self.num_block >= 4:
+            x = self.block4(x)
+        return self.block5(x)
+
+
+def MultipleBasicBlock_4(input_feature, intermediate_feature=64):
+    model = MultipleBasicBlock(input_feature, BasicBlock, 4,
+                               intermediate_feature)
+    return model
--- a/applications/DAIN/run.sh
+++ b/applications/DAIN/run.sh
+# Expose the correlation op directory and Paddle's libraries, then run
+# DAIN inference on a single video.
+
+# Stop if the op directory is missing: otherwise the `cd ../../` below would
+# leave the script two levels above the directory it was started from.
+cd pwcnet/correlation_op || exit 1
+# Append Paddle's library directory to the dynamic loader search path.
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(python -c 'import paddle; print(paddle.sysconfig.get_lib())')"
+# Make the modules in this directory importable from Python.
+export PYTHONPATH="$PYTHONPATH:$(pwd)"
+cd ../../ || exit 1
+
+VID_PATH=/paddle/work/github/DAIN/data/CBA.mp4
+OUT_PATH=output
+MODEL_PATH=DAIN_paddle_weight
+
+# Alternative entry point, kept for reference:
+#CUDA_VISIBLE_DEVICES=1 python demo.py \
+#    --time_step 0.125 \
+#    --video_path=$VID_PATH \
+#    --output_path=$OUT_PATH \
+#    --saved_model=$MODEL_PATH
+
+# Quote the expansions so paths containing spaces reach predict.py intact.
+CUDA_VISIBLE_DEVICES=2 python predict.py \
+    --time_step 0.125 \
+    --video_path="$VID_PATH" \
+    --output_path="$OUT_PATH" \
+    --saved_model="$MODEL_PATH"
--- a/applications/DAIN/util.py
+++ b/applications/DAIN/util.py
+import os, sys
+import glob
+import shutil
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def dump_frames_ffmpeg(vid_path, outpath, r=None, ss=None, t=None):
+    ffmpeg = ['ffmpeg ', ' -loglevel ', ' error ']
+    vid_name = vid_path.split('/')[-1].split('.')[0]
+    out_full_path = os.path.join(outpath, vid_name)
+
+    if not os.path.exists(out_full_path):
+        os.makedirs(out_full_path)
+
+    # video file name
+    outformat = out_full_path + '/%08d.png'
+
+    if ss is not None and t is not None and r is not None:
+        cmd = ffmpeg + [
+            ' -ss ',
+            ss,
+            ' -t ',
+            t,
+            ' -i ',
+            vid_path,
+            ' -r ',
+            r,
+            # ' -f ', ' image2 ',
+            #                        ' -s ', ' 960*540 ',
+            ' -qscale:v ',
+            ' 0.1 ',
+            ' -start_number ',
+            ' 0 ',
+            # ' -qmax ', ' 1 ',
+            outformat
+        ]
+    else:
+        cmd = ffmpeg + [' -i ', vid_path, ' -start_number ', ' 0 ', outformat]
+
+    cmd = ''.join(cmd)
+    print(cmd)
+    if os.system(cmd) == 0:
+        print('Video: {} done'.format(vid_name))
+    else:
+        print('Video: {} error'.format(vid_name))
+    print('')
+    sys.stdout.flush()
+    return out_full_path
+
+
+def frames_to_video_ffmpeg(framepath, videopath, r):
+    ffmpeg = ['ffmpeg ', ' -loglevel ', ' error ']
+    cmd = ffmpeg + [
+        ' -r ', r, ' -f ', ' image2 ', ' -i ', framepath, ' -vcodec ',
+        ' libx264 ', ' -pix_fmt ', ' yuv420p ', ' -crf ', ' 16 ', videopath
+    ]
+    cmd = ''.join(cmd)
+    print(cmd)
+
+    if os.system(cmd) == 0:
+        print('Video: {} done'.format(videopath))
+    else:
+        print('Video: {} error'.format(videopath))
+    print('')
+    sys.stdout.flush()
+
+
+def combine_frames(input, interpolated, combined, num_frames):
+    frames1 = sorted(glob.glob(os.path.join(input, '*.png')))
+    frames2 = sorted(glob.glob(os.path.join(interpolated, '*.png')))
+    num1 = len(frames1)
+    num2 = len(frames2)
+    # assert (num1 - 1) * num_frames == num2
+    for i in range(num1):
+        src = frames1[i]
+        imgname = int(src.split('/')[-1].split('.')[-2])
+        assert i == imgname
+        dst = os.path.join(combined, '{:08d}.png'.format(i * (num_frames + 1)))
+        shutil.copy2(src, dst)
+        if i < num1 - 1:
+            for k in range(num_frames):
+                src = frames2[i * num_frames + k]
+                dst = os.path.join(
+                    combined, '{:08d}.png'.format(i * (num_frames + 1) + k + 1))
+                shutil.copy2(src, dst)