From e41aff14b3f0d6e63cb041a3811f907bc5d848dd Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Thu, 6 Aug 2020 06:57:44 +0000 Subject: [PATCH] DAIN for video frame interpolation --- .pre-commit-config.yaml | 2 +- applications/DAIN/demo.py | 233 +++++++ applications/DAIN/my_args.py | 94 +++ applications/DAIN/networks/__init__.py | 3 + applications/DAIN/networks/dain.py | 124 ++++ applications/DAIN/networks/dain_slowmotion.py | 148 +++++ applications/DAIN/predict.py | 277 ++++++++ applications/DAIN/pwcnet/__init__.py | 1 + .../DAIN/pwcnet/correlation_op/README.md | 13 + .../DAIN/pwcnet/correlation_op/correlation.py | 46 ++ .../pwcnet/correlation_op/correlation_op.cc | 137 ++++ .../pwcnet/correlation_op/correlation_op.cu | 434 +++++++++++++ .../DAIN/pwcnet/correlation_op/make.sh | 31 + .../pwcnet/correlation_op/test_correlation.py | 136 ++++ applications/DAIN/pwcnet/pwcnet.py | 591 ++++++++++++++++++ applications/DAIN/resblock/__init__.py | 1 + applications/DAIN/resblock/basicblock.py | 94 +++ applications/DAIN/run.sh | 20 + applications/DAIN/util.py | 102 +++ 19 files changed, 2486 insertions(+), 1 deletion(-) create mode 100644 applications/DAIN/demo.py create mode 100644 applications/DAIN/my_args.py create mode 100644 applications/DAIN/networks/__init__.py create mode 100644 applications/DAIN/networks/dain.py create mode 100644 applications/DAIN/networks/dain_slowmotion.py create mode 100644 applications/DAIN/predict.py create mode 100644 applications/DAIN/pwcnet/__init__.py create mode 100644 applications/DAIN/pwcnet/correlation_op/README.md create mode 100644 applications/DAIN/pwcnet/correlation_op/correlation.py create mode 100644 applications/DAIN/pwcnet/correlation_op/correlation_op.cc create mode 100644 applications/DAIN/pwcnet/correlation_op/correlation_op.cu create mode 100644 applications/DAIN/pwcnet/correlation_op/make.sh create mode 100644 applications/DAIN/pwcnet/correlation_op/test_correlation.py create mode 100644 applications/DAIN/pwcnet/pwcnet.py create mode 100644 applications/DAIN/resblock/__init__.py create mode 100644 applications/DAIN/resblock/basicblock.py create mode 100644 applications/DAIN/run.sh create mode 100644 applications/DAIN/util.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d8f849f..a8e5812 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ entry: python ./.copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - exclude: (?!.*third_party)^.*$ \ No newline at end of file + exclude: (?!.*third_party)^.*$ diff --git a/applications/DAIN/demo.py b/applications/DAIN/demo.py new file mode 100644 index 0000000..6a4377c --- /dev/null +++ b/applications/DAIN/demo.py @@ -0,0 +1,233 @@ +import os, sys +import math +import random +import time +import glob +import shutil +import numpy as np +from imageio import imread, imsave +import cv2 + +import paddle.fluid as fluid + +import networks +from util import * +from my_args import args + +if __name__ == '__main__': + + DO_MiddleBurryOther = True + + video_path = args.video_path + output_path = args.output_path + frame_path_input = os.path.join(output_path, 'frames-input') + frame_path_interpolated = os.path.join(output_path, 'frames-interpolated') + frame_path_combined = os.path.join(output_path, 'frames-combined') + video_path_input = os.path.join(output_path, 'videos-input') + video_path_output = os.path.join(output_path, 'videos-output') + + if not os.path.exists(output_path): + os.makedirs(output_path) + if not 
os.path.exists(frame_path_input): + os.makedirs(frame_path_input) + if not os.path.exists(frame_path_interpolated): + os.makedirs(frame_path_interpolated) + if not os.path.exists(frame_path_combined): + os.makedirs(frame_path_combined) + if not os.path.exists(video_path_input): + os.makedirs(video_path_input) + if not os.path.exists(video_path_output): + os.makedirs(video_path_output) + + args.KEY_FRAME_THREAD = 0. + saved_model = args.saved_model + + timestep = args.time_step + num_frames = int(1.0 / timestep) - 1 + + image = fluid.data(name='image', + shape=[2, 1, args.channels, -1, -1], + dtype='float32') + DAIN = networks.__dict__["DAIN_slowmotion"](channel=args.channels, + filter_size=args.filter_size, + timestep=args.time_step, + training=False) + out = DAIN(image) + out = out[0][1] + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + fetch_list = [out.name] + + inference_program = fluid.default_main_program().clone(for_test=True) + inference_program = fluid.io.load_persistables(exe, saved_model, + inference_program) + + if not DO_MiddleBurryOther: + sys.exit() + + if video_path.endswith('.mp4'): + videos = [video_path] + else: + videos = sorted(glob.glob(os.path.join(video_path, '*.mp4'))) + for cnt, vid in enumerate(videos): + print("Interpolating video:", vid) + cap = cv2.VideoCapture(vid) + fps = cap.get(cv2.CAP_PROP_FPS) + print("Old fps (frame rate): ", fps) + + timestep = args.time_step + times_interp = int(1.0 / timestep) + r2 = str(int(fps) * times_interp) + + print("New fps (frame rate): ", r2) + + # set start and end of video + #ss = 0 + #t = 10 + #ss = time.strftime('%H:%M:%S', time.gmtime(ss)) + #t = time.strftime('%H:%M:%S', time.gmtime(t)) + #print(r, ss, t) + r = None + ss = None + t = None + + out_path = dump_frames_ffmpeg(vid, frame_path_input, r, ss, t) + + vidname = vid.split('/')[-1].split('.')[0] + + tot_timer = AverageMeter() + proc_timer = AverageMeter() + end = time.time() + + frames = sorted(glob.glob(os.path.join(out_path, '*.png'))) + + img = imread(frames[0]) + + int_width = img.shape[1] + int_height = img.shape[0] + channel = img.shape[2] + if not channel == 3: + continue + + if int_width != ((int_width >> 7) << 7): + int_width_pad = (((int_width >> 7) + 1) << 7) # more than necessary + padding_left = int((int_width_pad - int_width) / 2) + padding_right = int_width_pad - int_width - padding_left + else: + int_width_pad = int_width + padding_left = 32 + padding_right = 32 + + if int_height != ((int_height >> 7) << 7): + int_height_pad = ( + ((int_height >> 7) + 1) << 7) # more than necessary + padding_top = int((int_height_pad - int_height) / 2) + padding_bottom = int_height_pad - int_height - padding_top + else: + int_height_pad = int_height + padding_top = 32 + padding_bottom = 32 + + frame_num = len(frames) + print(os.path.join(frame_path_input, vidname, '*.png')) + print('processing {} frames, from video: {}'.format(frame_num, vid)) + + if not os.path.exists(os.path.join(frame_path_interpolated, vidname)): + os.makedirs(os.path.join(frame_path_interpolated, vidname)) + if not os.path.exists(os.path.join(frame_path_combined, vidname)): + os.makedirs(os.path.join(frame_path_combined, vidname)) + + for i in range(frame_num - 1): + print(frames[i]) + first = frames[i] + second = frames[i + 1] + + img_first = imread(first) + img_second = imread(second) + '''--------------Frame change test------------------------''' + img_first_gray = np.dot(img_first[..., :3], [0.299, 0.587, 0.114]) + img_second_gray = 
np.dot(img_second[..., :3], [0.299, 0.587, 0.114]) + + img_first_gray = img_first_gray.flatten(order='C') + img_second_gray = img_second_gray.flatten(order='C') + corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1] + key_frame = False + if corr < args.KEY_FRAME_THREAD: + key_frame = True + '''-------------------------------------------------------''' + + X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255 + X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255 + + if key_frame: + y_ = [ + np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0)) + for i in range(num_frames) + ] + else: + assert (X0.shape[1] == X1.shape[1]) + assert (X0.shape[2] == X1.shape[2]) + + print("size before padding ", X0.shape) + X0 = np.pad(X0, ((0,0), (padding_top, padding_bottom), \ + (padding_left, padding_right)), mode='edge') + X1 = np.pad(X1, ((0,0), (padding_top, padding_bottom), \ + (padding_left, padding_right)), mode='edge') + print("size after padding ", X0.shape) + + X0 = np.expand_dims(X0, axis=0) + X1 = np.expand_dims(X1, axis=0) + + X0 = np.expand_dims(X0, axis=0) + X1 = np.expand_dims(X1, axis=0) + + X = np.concatenate((X0, X1), axis=0) + + proc_end = time.time() + o = exe.run(inference_program, + fetch_list=fetch_list, + feed={"image": X}) + y_ = o[0] + + proc_timer.update(time.time() - proc_end) + tot_timer.update(time.time() - end) + end = time.time() + print("*******current image process time \t " + + str(time.time() - proc_end) + "s ******") + + y_ = [ + np.transpose( + 255.0 * item.clip( + 0, 1.0)[0, :, padding_top:padding_top + int_height, + padding_left:padding_left + int_width], + (1, 2, 0)) for item in y_ + ] + time_offsets = [ + kk * timestep for kk in range(1, 1 + num_frames, 1) + ] + + count = 1 + for item, time_offset in zip(y_, time_offsets): + out_dir = os.path.join( + frame_path_interpolated, vidname, + "{:0>4d}_{:0>4d}.png".format(i, count)) + count = count + 1 + imsave(out_dir, np.round(item).astype(np.uint8)) + + timestep = args.time_step + num_frames = int(1.0 / timestep) - 1 + + input_dir = os.path.join(frame_path_input, vidname) + interpolated_dir = os.path.join(frame_path_interpolated, vidname) + combined_dir = os.path.join(frame_path_combined, vidname) + combine_frames(input_dir, interpolated_dir, combined_dir, num_frames) + + frame_pattern_combined = os.path.join(frame_path_combined, vidname, + '%08d.png') + video_pattern_output = os.path.join(video_path_output, vidname + '.mp4') + if os.path.exists(video_pattern_output): + os.remove(video_pattern_output) + frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output, r2) diff --git a/applications/DAIN/my_args.py b/applications/DAIN/my_args.py new file mode 100644 index 0000000..ee4e5f4 --- /dev/null +++ b/applications/DAIN/my_args.py @@ -0,0 +1,94 @@ +import os +import datetime +import argparse +import numpy +import networks + +modelnames = networks.__all__ +# import datasets +datasetNames = ('Vimeo_90K_interp') #datasets.__all__ + +parser = argparse.ArgumentParser(description='DAIN') + +parser.add_argument('--debug', action='store_true', help='Enable debug mode') +parser.add_argument('--netName', + type=str, + default='DAIN', + choices=modelnames, + help='model architecture: ' + ' | '.join(modelnames) + + ' (default: DAIN)') + +parser.add_argument('--datasetName', + default='Vimeo_90K_interp', + choices=datasetNames, + nargs='+', + help='dataset type : ' + ' | '.join(datasetNames) + + ' (default: Vimeo_90K_interp)') +parser.add_argument('--video_path', + default='', + help='the path of selected 
videos') +parser.add_argument('--output_path', default='', help='the output root path') + +parser.add_argument('--seed', + type=int, + default=1, + help='random seed (default: 1)') + +parser.add_argument('--batch_size', + '-b', + type=int, + default=1, + help='batch size (default:1)') +parser.add_argument('--channels', + '-c', + type=int, + default=3, + choices=[1, 3], + help='channels of images (default:3)') +parser.add_argument('--filter_size', + '-f', + type=int, + default=4, + help='the size of filters used (default: 4)', + choices=[2, 4, 6, 5, 51]) + +parser.add_argument('--time_step', + type=float, + default=0.5, + help='choose the time steps') +parser.add_argument( + '--alpha', + type=float, + nargs='+', + default=[0.0, 1.0], + help= + 'the ration of loss for interpolated and rectified result (default: [0.0, 1.0])' +) +parser.add_argument('--frame_rate', + type=int, + default=None, + help='frame rate of the input video') + +parser.add_argument('--patience', + type=int, + default=5, + help='the patience of reduce on plateou') +parser.add_argument('--factor', + type=float, + default=0.2, + help='the factor of reduce on plateou') + +parser.add_argument('--saved_model', + type=str, + default='', + help='path to the model weights') +parser.add_argument('--no-date', + action='store_true', + help='don\'t append date timestamp to folder') +parser.add_argument('--use_cuda', + default=True, + type=bool, + help='use cuda or not') +parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not') + +args = parser.parse_args() diff --git a/applications/DAIN/networks/__init__.py b/applications/DAIN/networks/__init__.py new file mode 100644 index 0000000..2462f96 --- /dev/null +++ b/applications/DAIN/networks/__init__.py @@ -0,0 +1,3 @@ +from .dain import DAIN +from .dain_slowmotion import DAIN_slowmotion +__all__ = ('DAIN', 'DAIN_slowmotion') diff --git a/applications/DAIN/networks/dain.py b/applications/DAIN/networks/dain.py new file mode 100644 index 0000000..8d51c9f --- /dev/null +++ b/applications/DAIN/networks/dain.py @@ -0,0 +1,124 @@ +import paddle.fluid as fluid +import resblock +import pwcnet + + +class DAIN(fluid.dygraph.Layer): + def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True): + # base class initialization + super(DAIN, self).__init__() + + self.filter_size = filter_size + self.training = training + self.timestep = timestep + assert (timestep == 0.5) + self.numFrames = int(1.0 / timestep) - 1 + + ctx_ch = 3 * 64 + 3 + inplanes = 3 + 3 + 3 + 2 * 1 + 2 * 2 + 16 * 2 + 2 * ctx_ch + + self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes, + 64) + self.flownets = pwcnet.__dict__['pwc_dc_net']() + self.div_flow = 20.0 + + def forward(self, input): + """ + Parameters + ---------- + input: shape (3, batch, 3, width, height) + ----------- + """ + losses = [] + offsets = [] + ''' + STEP 1: sequeeze the input + ''' + if self.training == True: + + assert input.shape[0] == 3 + input_0 = input[0] + input_1 = input[1] + input_2 = input[2] + else: + # print(input.shape[0]) + assert input.shape[0] == 2 + input_0 = input[0] + input_2 = input[1] + + #prepare the input data of current scale + cur_input_0 = input_0 + if self.training == True: + cur_input_1 = input_1 + cur_input_2 = input_2 + ''' + STEP 3.2: concatenating the inputs. 
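+        The two input frames are concatenated along the channel axis
+        (axis=1), giving a 6-channel tensor from which PWC-Net estimates
+        the optical flow.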
+        '''
+        cur_offset_input = fluid.layers.concat([cur_input_0, cur_input_2],
+                                               axis=1)
+        '''
+        STEP 3.3: perform the flow estimation
+        '''
+        time_offsets = [
+            kk * self.timestep for kk in range(1, 1 + self.numFrames, 1)
+        ]
+
+        cur_offset_outputs = [
+            self.forward_flownets(self.flownets,
+                                  cur_offset_input,
+                                  time_offsets=time_offsets),
+            self.forward_flownets(self.flownets,
+                                  fluid.layers.concat(
+                                      [cur_input_2, cur_input_0], axis=1),
+                                  time_offsets=time_offsets[::-1])
+        ]
+
+        cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]]
+
+        # warp both inputs with the warp op from PWC-Net and blend them
+        ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0])
+        ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1])
+        cur_output = (ref0 + ref2) / 2.0
+
+        rectify_input = fluid.layers.concat([
+            cur_output, ref0, ref2, cur_offset_output[0], cur_offset_output[1]
+        ],
+                                            axis=1)
+
+        cur_output_rectified = self.rectifyNet(rectify_input) + cur_output
+        '''
+        STEP 3.5: in the training phase, collect the variables to be penalized.
+        '''
+        if self.training == True:
+            losses += [cur_output - cur_input_1]
+            losses += [cur_output_rectified - cur_input_1]
+            offsets += [cur_offset_output]
+        '''
+        STEP 4: return the results
+        '''
+        if self.training == True:
+            # in the training phase, output the losses to be minimized
+            # return losses, loss_occlusion
+            return losses, offsets
+        else:
+            cur_outputs = [cur_output, cur_output_rectified]
+            return cur_outputs, cur_offset_output
+
+    def forward_flownets(self, model, input, time_offsets=None):
+
+        if time_offsets == None:
+            time_offsets = [0.5]
+        elif type(time_offsets) == float:
+            time_offsets = [time_offsets]
+        elif type(time_offsets) == list:
+            pass
+        # the network predicts motion in a single direction, not bidirectionally
+        temp = model(input)
+
+        # scale the unidirectional flow to every requested time offset
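+        # PWC-Net predicts flow at 1/4 of the input resolution, with
+        # magnitudes reduced by div_flow (20.0, set in __init__). Multiplying
+        # by div_flow * t recovers the flow towards time step t, and
+        # resize_bilinear(scale=4) restores the input resolution; e.g. with
+        # timestep=0.5 the single offset flow is scaled by 20.0 * 0.5 = 10.0.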
+ temps = [ + self.div_flow * temp * time_offset for time_offset in time_offsets + ] + # nearest interpolation won't be better i think + temps = [fluid.layers.resize_bilinear(temp, scale=4) for temp in temps] + return temps diff --git a/applications/DAIN/networks/dain_slowmotion.py b/applications/DAIN/networks/dain_slowmotion.py new file mode 100644 index 0000000..e21b3ee --- /dev/null +++ b/applications/DAIN/networks/dain_slowmotion.py @@ -0,0 +1,148 @@ +import paddle.fluid as fluid +import resblock +import time +import pwcnet + + +class DAIN_slowmotion(fluid.dygraph.Layer): + def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True): + # base class initialization + super(DAIN_slowmotion, self).__init__() + + self.filter_size = filter_size + self.training = training + self.timestep = timestep + self.num_frames = int(1.0 / timestep) - 1 + + ctx_ch = 3 * 64 + 3 + # inplanes = 3 + 3 + 3 + 2*1 + 2*2 + 2 + inplanes = 13 + + self.flownets = pwcnet.__dict__['pwc_dc_net']() + self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes, + 64) + self.div_flow = 20.0 + + def forward(self, input): + """ + Parameters + ---------- + input: shape (3, batch, 3, width, height) + ----------- + """ + losses = [] + offsets = [] + ''' + STEP 1: sequeeze the input + ''' + if self.training == True: + + assert input.shape[0] == 3 + input_0 = input[0] + input_1 = input[1] + input_2 = input[2] + else: + assert input.shape[0] == 2 + input_0 = input[0] + input_2 = input[1] + + #prepare the input data of current scale + cur_input_0 = input_0 + if self.training == True: + cur_input_1 = input_1 + cur_input_2 = input_2 + ''' + STEP 3.2: concatenating the inputs. + ''' + cur_offset_input = fluid.layers.concat([cur_input_0, cur_input_2], + axis=1) + ''' + STEP 3.3: perform the estimation + ''' + time_offsets = [ + kk * self.timestep for kk in range(1, 1 + self.num_frames, 1) + ] + + cur_offset_outputs = [ + self.forward_flownets(self.flownets, + cur_offset_input, + time_offsets=time_offsets), + self.forward_flownets(self.flownets, + fluid.layers.concat( + [cur_input_2, cur_input_0], axis=1), + time_offsets=time_offsets[::-1]) + ] + ''' + STEP 3.4: perform the frame interpolation process + ''' + count = 0 + for temp_0, temp_1, timeoffset in zip(cur_offset_outputs[0], + cur_offset_outputs[1], + time_offsets): + cur_offset_output = [temp_0, temp_1] + + ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0]) + ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1]) + cur_output_temp = (ref0 + ref2) / 2.0 + + if count == 0: + cur_output = fluid.layers.unsqueeze(cur_output_temp, axes=0) + else: + cur_output_ = fluid.layers.unsqueeze(cur_output_temp, axes=0) + cur_output = fluid.layers.concat([cur_output, cur_output_], + axis=0) + + rectify_input = fluid.layers.concat([ + cur_output_temp, ref0, ref2, cur_offset_output[0], + cur_offset_output[1] + ], + axis=1) + + cur_output_rectified_temp = self.rectifyNet( + rectify_input) + cur_output_temp + + if count == 0: + cur_output_rectified = fluid.layers.unsqueeze( + cur_output_rectified_temp, axes=0) + else: + cur_output_rectified_ = fluid.layers.unsqueeze( + cur_output_rectified_temp, axes=0) + cur_output_rectified = fluid.layers.concat( + [cur_output_rectified, cur_output_rectified_], axis=0) + + count += 1 + ''' + STEP 3.5: for training phase, we collect the variables to be penalized. 
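+        The losses are the residuals of the blended frame and of the
+        rectified frame against the ground-truth middle frame (cur_input_1).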
+ ''' + if self.training == True: + losses += [cur_output - cur_input_1] + losses += [cur_output_rectified - cur_input_1] + offsets += [cur_offset_output] + ''' + STEP 4: return the results + ''' + if self.training == True: + # if in the training phase, we output the losses to be minimized. + # return losses, loss_occlusion + return losses, offsets + else: + cur_outputs = [cur_output, cur_output_rectified] + return cur_outputs, cur_offset_output + + def forward_flownets(self, model, input, time_offsets=None): + if time_offsets == None: + time_offsets = [0.5] + elif type(time_offsets) == float: + time_offsets = [time_offsets] + elif type(time_offsets) == list: + pass + # this is a single direction motion results, but not a bidirectional one + temp = model(input) + + # single direction to bidirection should haven it. + temps = [ + self.div_flow * temp * time_offset for time_offset in time_offsets + ] + # nearest interpolation won't be better i think + temps = [fluid.layers.resize_bilinear(temp, scale=4) for temp in temps] + return temps diff --git a/applications/DAIN/predict.py b/applications/DAIN/predict.py new file mode 100644 index 0000000..3923906 --- /dev/null +++ b/applications/DAIN/predict.py @@ -0,0 +1,277 @@ +import os, sys +import math +import random +import time +import glob +import shutil +import numpy as np +from imageio import imread, imsave +import cv2 + +import paddle.fluid as fluid + +import networks +from util import * +from my_args import args + + +def infer_engine(model_dir, + run_mode='fluid', + batch_size=1, + use_gpu=False, + min_subgraph_size=3): + if not use_gpu and not run_mode == 'fluid': + raise ValueError( + "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" + .format(run_mode, use_gpu)) + precision_map = { + 'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32, + 'trt_fp16': fluid.core.AnalysisConfig.Precision.Half + } + config = fluid.core.AnalysisConfig(os.path.join(model_dir, 'model'), + os.path.join(model_dir, 'params')) + if use_gpu: + # initial GPU memory(M), device ID + config.enable_use_gpu(100, 0) + # optimize graph and fuse op + config.switch_ir_optim(True) + else: + config.disable_gpu() + + if run_mode in precision_map.keys(): + config.enable_tensorrt_engine(workspace_size=1 << 10, + max_batch_size=batch_size, + min_subgraph_size=min_subgraph_size, + precision_mode=precision_map[run_mode], + use_static=False, + use_calib_mode=False) + + # disable print log when predict + config.disable_glog_info() + # enable shared memory + config.enable_memory_optim() + # disable feed, fetch OP, needed by zero_copy_run + config.switch_use_feed_fetch_ops(False) + predictor = fluid.core.create_paddle_predictor(config) + return predictor + + +def executor(model_dir, use_gpu=False): + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + program, feed_names, fetch_targets = fluid.io.load_inference_model( + dirname=model_dir, + executor=exe, + model_filename='model', + params_filename='params') + return exe, program, fetch_targets + + +class VideoFrameInterp(object): + def __init__(self, + time_step, + model_path, + video_path, + use_gpu=True, + key_frame_thread=0., + output_path='output'): + self.video_path = video_path + self.output_path = output_path + self.model_path = model_path + self.time_step = time_step + self.key_frame_thread = key_frame_thread + + self.exe, self.program, self.fetch_targets = executor(model_path, + use_gpu=use_gpu) + # self.predictor = load_predictor( + # model_dir, + # 
run_mode=run_mode, + # min_subgraph_size=3, + # use_gpu=use_gpu) + + def run(self): + frame_path_input = os.path.join(self.output_path, 'frames-input') + frame_path_interpolated = os.path.join(self.output_path, + 'frames-interpolated') + frame_path_combined = os.path.join(self.output_path, 'frames-combined') + video_path_output = os.path.join(self.output_path, 'videos-output') + + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + if not os.path.exists(frame_path_input): + os.makedirs(frame_path_input) + if not os.path.exists(frame_path_interpolated): + os.makedirs(frame_path_interpolated) + if not os.path.exists(frame_path_combined): + os.makedirs(frame_path_combined) + if not os.path.exists(video_path_output): + os.makedirs(video_path_output) + + timestep = self.time_step + num_frames = int(1.0 / timestep) - 1 + + if self.video_path.endswith('.mp4'): + videos = [self.video_path] + else: + videos = sorted(glob.glob(os.path.join(self.video_path, '*.mp4'))) + + for cnt, vid in enumerate(videos): + print("Interpolating video:", vid) + cap = cv2.VideoCapture(vid) + fps = cap.get(cv2.CAP_PROP_FPS) + print("Old fps (frame rate): ", fps) + + times_interp = int(1.0 / timestep) + r2 = str(int(fps) * times_interp) + print("New fps (frame rate): ", r2) + + out_path = dump_frames_ffmpeg(vid, frame_path_input) + + vidname = vid.split('/')[-1].split('.')[0] + + tot_timer = AverageMeter() + proc_timer = AverageMeter() + end = time.time() + + frames = sorted(glob.glob(os.path.join(out_path, '*.png'))) + + img = imread(frames[0]) + + int_width = img.shape[1] + int_height = img.shape[0] + channel = img.shape[2] + if not channel == 3: + continue + + if int_width != ((int_width >> 7) << 7): + int_width_pad = ( + ((int_width >> 7) + 1) << 7) # more than necessary + padding_left = int((int_width_pad - int_width) / 2) + padding_right = int_width_pad - int_width - padding_left + else: + int_width_pad = int_width + padding_left = 32 + padding_right = 32 + + if int_height != ((int_height >> 7) << 7): + int_height_pad = ( + ((int_height >> 7) + 1) << 7) # more than necessary + padding_top = int((int_height_pad - int_height) / 2) + padding_bottom = int_height_pad - int_height - padding_top + else: + int_height_pad = int_height + padding_top = 32 + padding_bottom = 32 + + frame_num = len(frames) + print('processing {} frames, from video: {}'.format(frame_num, vid)) + + if not os.path.exists(os.path.join(frame_path_interpolated, + vidname)): + os.makedirs(os.path.join(frame_path_interpolated, vidname)) + if not os.path.exists(os.path.join(frame_path_combined, vidname)): + os.makedirs(os.path.join(frame_path_combined, vidname)) + + for i in range(frame_num - 1): + print(frames[i]) + first = frames[i] + second = frames[i + 1] + + img_first = imread(first) + img_second = imread(second) + '''--------------Frame change test------------------------''' + img_first_gray = np.dot(img_first[..., :3], + [0.299, 0.587, 0.114]) + img_second_gray = np.dot(img_second[..., :3], + [0.299, 0.587, 0.114]) + + img_first_gray = img_first_gray.flatten(order='C') + img_second_gray = img_second_gray.flatten(order='C') + corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1] + key_frame = False + if corr < self.key_frame_thread: + key_frame = True + '''-------------------------------------------------------''' + + X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255 + X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255 + + if key_frame: + y_ = [ + np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0)) + 
for i in range(num_frames) + ] + else: + assert (X0.shape[1] == X1.shape[1]) + assert (X0.shape[2] == X1.shape[2]) + + print("size before padding ", X0.shape) + X0 = np.pad(X0, ((0,0), (padding_top, padding_bottom), \ + (padding_left, padding_right)), mode='edge') + X1 = np.pad(X1, ((0,0), (padding_top, padding_bottom), \ + (padding_left, padding_right)), mode='edge') + print("size after padding ", X0.shape) + + X0 = np.expand_dims(X0, axis=0) + X1 = np.expand_dims(X1, axis=0) + + X0 = np.expand_dims(X0, axis=0) + X1 = np.expand_dims(X1, axis=0) + + X = np.concatenate((X0, X1), axis=0) + + proc_end = time.time() + o = self.exe.run(self.program, + fetch_list=self.fetch_targets, + feed={"image": X}) + + y_ = o[0] + + proc_timer.update(time.time() - proc_end) + tot_timer.update(time.time() - end) + end = time.time() + print("*********** current image process time \t " + + str(time.time() - proc_end) + "s *********") + + y_ = [ + np.transpose( + 255.0 * item.clip( + 0, 1.0)[0, :, + padding_top:padding_top + int_height, + padding_left:padding_left + int_width], + (1, 2, 0)) for item in y_ + ] + time_offsets = [ + kk * timestep for kk in range(1, 1 + num_frames, 1) + ] + + count = 1 + for item, time_offset in zip(y_, time_offsets): + out_dir = os.path.join( + frame_path_interpolated, vidname, + "{:0>4d}_{:0>4d}.png".format(i, count)) + count = count + 1 + imsave(out_dir, np.round(item).astype(np.uint8)) + + num_frames = int(1.0 / timestep) - 1 + + input_dir = os.path.join(frame_path_input, vidname) + interpolated_dir = os.path.join(frame_path_interpolated, vidname) + combined_dir = os.path.join(frame_path_combined, vidname) + combine_frames(input_dir, interpolated_dir, combined_dir, + num_frames) + + frame_pattern_combined = os.path.join(frame_path_combined, vidname, + '%08d.png') + video_pattern_output = os.path.join(video_path_output, + vidname + '.mp4') + if os.path.exists(video_pattern_output): + os.remove(video_pattern_output) + frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output, + r2) + + +if __name__ == '__main__': + predictor = VideoFrameInterp(args.time_step, args.saved_model, + args.video_path, args.output_path) + predictor.run() diff --git a/applications/DAIN/pwcnet/__init__.py b/applications/DAIN/pwcnet/__init__.py new file mode 100644 index 0000000..bf51e1d --- /dev/null +++ b/applications/DAIN/pwcnet/__init__.py @@ -0,0 +1 @@ +from .pwcnet import * diff --git a/applications/DAIN/pwcnet/correlation_op/README.md b/applications/DAIN/pwcnet/correlation_op/README.md new file mode 100644 index 0000000..d413853 --- /dev/null +++ b/applications/DAIN/pwcnet/correlation_op/README.md @@ -0,0 +1,13 @@ +自定义OP编译: +2. sh make.sh编译成correlation_lib.so动态库 +3. 添加动态库路径到LD_LIBRARY_PATH: +``` +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`python3.7 -c 'import paddle; print(paddle.sysconfig.get_lib())'` +``` +4. 添加correlation op的python路径: +``` +export PYTHONPATH=$PYTHONPATH:`pwd` +``` +5. python test_correlation.py运行单测,验证是否加载成功。 + +PS: 如果paddle whl包是从官网上下载的,需要使用gcc 4.8,即把make.sh中的g++ 改为 g++-4.8 diff --git a/applications/DAIN/pwcnet/correlation_op/correlation.py b/applications/DAIN/pwcnet/correlation_op/correlation.py new file mode 100644 index 0000000..6a75d6a --- /dev/null +++ b/applications/DAIN/pwcnet/correlation_op/correlation.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle.fluid as fluid +import os +file_dir = os.path.dirname(os.path.abspath(__file__)) +fluid.load_op_library(os.path.join(file_dir, 'correlation_lib.so')) + +from paddle.fluid.layer_helper import LayerHelper + + +def correlation(input1, + input2, + pad_size, + kernel_size, + max_displacement, + stride1, + stride2, + corr_type_multiply=1): + helper = LayerHelper("correlation", **locals()) + output = helper.create_variable_for_type_inference(dtype=input1.dtype) + helper.append_op(type="correlation", + inputs={ + "Input1": input1, + "Input2": input2 + }, + attrs={ + "pad_size": pad_size, + "kernel_size": kernel_size, + "max_displacement": max_displacement, + "stride1": stride1, + "stride2": stride2, + "corr_type_multiply": corr_type_multiply + }, + outputs={"Output": output}) + return output diff --git a/applications/DAIN/pwcnet/correlation_op/correlation_op.cc b/applications/DAIN/pwcnet/correlation_op/correlation_op.cc new file mode 100644 index 0000000..0c638cf --- /dev/null +++ b/applications/DAIN/pwcnet/correlation_op/correlation_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+inline std::vector<int64_t> CorrelationOutputSize(int batch, int input_height, int input_width, int stride1, int stride2, int kernel_size, int pad_size, int max_displacement) {
+
+  std::vector<int64_t> output_shape({batch});
+  int kernel_radius = (kernel_size - 1) / 2;
+  int border_radius = kernel_radius + max_displacement;
+  int padded_input_height = input_height + 2 * pad_size;
+  int padded_input_width = input_width + 2 * pad_size;
+  int output_channel = ((max_displacement / stride2) * 2 + 1) * ((max_displacement / stride2) * 2 + 1);
+  output_shape.push_back(output_channel);
+  int output_height = std::ceil(static_cast<float>(padded_input_height - 2 * border_radius) / static_cast<float>(stride1));
+  int output_width = std::ceil(static_cast<float>(padded_input_width - 2 * border_radius) / static_cast<float>(stride1));
+  output_shape.push_back(output_height);
+  output_shape.push_back(output_width);
+  return output_shape;
+}
+
+class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input1", "input1");
+    AddInput("Input2", "input2");
+    AddOutput("Output", "output");
+    AddAttr<int>("pad_size", "pad size for input1 and input2");
+    AddAttr<int>("kernel_size", "kernel size of input1 and input2");
+    AddAttr<int>("max_displacement", "max displacement of input1 and input2");
+    AddAttr<int>("stride1", "Input1 stride");
+    AddAttr<int>("stride2", "Input2 stride");
+    AddAttr<int>("corr_type_multiply", "correlation coefficient").SetDefault(1);
+    AddComment(R"DOC(Correlation of two feature maps. Only supports the NCHW data format.)DOC");
+  }
+};
+
+class CorrelationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(Input1) cannot be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(Input2) cannot be null");
+    int stride1 = ctx->Attrs().Get<int>("stride1");
+    int stride2 = ctx->Attrs().Get<int>("stride2");
+    int max_displacement = ctx->Attrs().Get<int>("max_displacement");
+    int pad_size = ctx->Attrs().Get<int>("pad_size");
+    int kernel_size = ctx->Attrs().Get<int>("kernel_size");
+
+    auto in_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    PADDLE_ENFORCE_EQ(in_dims.size() == 4, true, "Input1 must be a 4-D tensor");
+    PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true, "Input2 must be a 4-D tensor");
+    std::vector<int64_t> output_shape = CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1, stride2, kernel_size, pad_size, max_displacement);
+    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
+    PADDLE_ENFORCE_EQ(input_data_type, ctx.Input<Tensor>("Input2")->type(), "Input1 and Input2 should have the same type");
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class CorrelationOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("correlation_grad");
+    op->SetInput("Input1", this->Input("Input1"));
+    op->SetInput("Input2", this->Input("Input2"));
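+    // the backward op consumes both forward inputs and the incoming
+    // gradient of Output, and emits gradients for Input1 and Input2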
+    op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
+    op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
+    op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+class CorrelationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(Input1) should not be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(Input2) should not be null");
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Output")), true, "Input(Output@GRAD) should not be null");
+
+    auto in1_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
+    ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* var = ctx.InputVar(framework::GradVarName("Output"));
+    if (var == nullptr) {
+      PADDLE_THROW("cannot find Output@GRAD");
+    }
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
+                  ops::CorrelationOpGradMaker<paddle::framework::OpDesc>,
+                  ops::CorrelationOpGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
diff --git a/applications/DAIN/pwcnet/correlation_op/correlation_op.cu b/applications/DAIN/pwcnet/correlation_op/correlation_op.cu
new file mode 100644
index 0000000..1618444
--- /dev/null
+++ b/applications/DAIN/pwcnet/correlation_op/correlation_op.cu
@@ -0,0 +1,434 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+#define THREADS_PER_BLOCK 32
+#define FULL_MASK 0xffffffff
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__forceinline__ __device__ T warpReduceSum(T val) {
+  for (int offset = 16; offset > 0; offset /= 2) {
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  }
+  return val;
+}
+
+template <typename T>
+__forceinline__ __device__ T blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = warpReduceSum(val);
+  if (lane == 0)
+    shared[wid] = val;
+
+  __syncthreads();
+  val = (threadIdx.x < blockDim.x / warpSize) ?
shared[lane] : 0; + + if (wid == 0) + val = warpReduceSum(val); + + return val; +} + +template +__global__ void set_zero(T *x, int num) { + for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) + x[i] = static_cast(0); +} + +template +__global__ void channel_first(const T *input, T *rinput, const int channel, const int height, const int width, const int pad_size) { + int n = blockIdx.x; + int h = blockIdx.y; + int w = blockIdx.z; + + int ch_off = threadIdx.x; + T value; + int dimchw = channel * height * width; + int dimhw = height * width; + + int p_dimw = (width + 2 * pad_size); + int p_dimh = (height + 2 * pad_size); + int p_dimchw = channel * p_dimw * p_dimh; + int p_dimcw = channel * p_dimw; + + for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) { + value = input[n * dimchw + c * dimhw + h * width + w]; + rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel + c] = value; + } +} + +template +__global__ void correlation_forward(T *output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int input_channel, const int input_height, const int input_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) { + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + + int displacement_size = 2 * displacement_rad + 1; + + int n = blockIdx.x; + int h1 = blockIdx.y * stride1 + max_displacement; + int w1 = blockIdx.z * stride1 + max_displacement; + int c = threadIdx.x; + + int p_dimchw = p_input_height * p_input_width * input_channel; + int p_dimcw = p_input_width * input_channel; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int nelems = kernel_size * kernel_size * p_dimc; + + for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) { + for(int ti = -displacement_rad; ti <= displacement_rad; ++ti) { + int w2 = w1 + ti * stride2; + int h2 = h1 + tj * stride2; + + T acc0 = 0; + for(int j = -kernel_rad; j <= kernel_rad; ++j) { + for(int i = -kernel_rad; i <= kernel_rad; ++i) { + for(int ch = c; ch < p_dimc; ch += blockDim.x) { + int index1 = n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch; + int index2 = n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch; + acc0 += static_cast(rinput1[index1] * rinput2[index2]); + } + } + } + if (blockDim.x == warpSize) { + __syncwarp(); + acc0 = warpReduceSum(acc0); + } else { + __syncthreads(); + acc0 = blockReduceSum(acc0); + } + + if (threadIdx.x == 0) { + int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad); + const int t_index = n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z; + output[t_index] = static_cast(acc0 / nelems); + } + } + } + +} + +//class CorrelationKernel +template +class CorrelationKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must be CUDAPlace"); + + auto *input1 = ctx.Input("Input1"); + auto *input2 = ctx.Input("Input2"); + int pad_size = ctx.Attr("pad_size"); + int kernel_size = ctx.Attr("kernel_size"); + int stride1 = ctx.Attr("stride1"); + int stride2 = ctx.Attr("stride2"); + int 
max_displacement = ctx.Attr("max_displacement"); + int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + // base on input1, NCHW + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 * pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor({N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor({N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data(), rinput2.numel()); + set_zero<<<(output->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(output->data(), output->numel()); + + auto out_dims = output->dims(); + int OC = out_dims[1]; + int OH = out_dims[2]; + int OW = out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>(input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>(input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(N, OH, OW); + + correlation_forward<<>>(output->data(), OC, OH, OW, rinput1.data(), +C, H, W, rinput2.data(), pad_size, kernel_size, max_displacement, stride1, stride2); + } +}; + +template +__global__ void correlation_backward_input1(int item, T *grad_input1, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) { + + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int displacement_size = 2 * displacement_rad + 1; + + int xmin = (w - kernel_rad - max_displacement) / stride1; + int ymin = (h - kernel_rad - max_displacement) / stride1; + + int xmax = (w + kernel_rad - max_displacement) / stride1; + int ymax = (h + kernel_rad - max_displacement) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + return; + } + + if (xmin > xmax || ymin > ymax) { + return; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems = kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < 
output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c; + + T val2 = rinput2[index2]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val2; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index1 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + grad_input1[index1] = static_cast(reduce_sum / nelems); + } + +} + +template +__global__ void correlation_backward_input2(int item, T *grad_input2, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2){ + + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int displacement_size = 2 * displacement_rad + 1; + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems = kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int xmin = (w - kernel_rad - max_displacement - i2) / stride1; + int ymin = (h - kernel_rad - max_displacement - j2) / stride1; + + int xmax = (w + kernel_rad - max_displacement - i2) / stride1; + int ymax = (h + kernel_rad - max_displacement - j2) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + continue; + } + + if (xmin > xmax || ymin > ymax) { + continue; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c; + T val1 = rinput1[index1]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val1; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index2 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + grad_input2[index2] = static_cast(reduce_sum / nelems); + } +} + +template +class 
CorrelationGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must use CUDAPlace."); + const auto *input1 = ctx.Input("Input1"); + const auto *input2 = ctx.Input("Input2"); + const auto *grad_output = ctx.Input(framework::GradVarName("Output")); + const int pad_size = ctx.Attr("pad_size"); + const int kernel_size = ctx.Attr("kernel_size"); + const int stride1 = ctx.Attr("stride1"); + const int stride2 = ctx.Attr("stride2"); + const int max_displacement = ctx.Attr("max_displacement"); + const int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *grad_input1 = ctx.Output(framework::GradVarName("Input1")); + grad_input1->mutable_data(ctx.GetPlace()); + auto *grad_input2 = ctx.Output(framework::GradVarName("Input2")); + grad_input2->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 * pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor({N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor({N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data(), rinput2.numel()); + set_zero<<<(grad_input1->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input1->data(), grad_input1->numel()); + set_zero<<<(grad_input2->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input2->data(), grad_input2->numel()); + + auto grad_out_dims = grad_output->dims(); + int GOC = grad_out_dims[1]; + int GOH = grad_out_dims[2]; + int GOW = grad_out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>(input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>(input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(H, W, C); + + for (int n = 0; n < N; n++) { + correlation_backward_input1<<>>(n, grad_input1->data(), C, H, W, grad_output->data(), GOC, GOH, GOW, rinput2.data(), pad_size, kernel_size, max_displacement, stride1, stride2); + } + + for (int n = 0; n < N; n++) { + correlation_backward_input2<<>>(n, grad_input2->data(), C, H, W, grad_output->data(), GOC, GOH, GOW, rinput1.data(), pad_size, kernel_size, max_displacement, stride1, stride2); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + correlation, ops::CorrelationKernel, + ops::CorrelationKernel); +REGISTER_OP_CUDA_KERNEL( + correlation_grad, ops::CorrelationGradKernel, + ops::CorrelationGradKernel); + diff --git a/applications/DAIN/pwcnet/correlation_op/make.sh b/applications/DAIN/pwcnet/correlation_op/make.sh new file mode 100644 index 0000000..1aad7eb --- /dev/null +++ b/applications/DAIN/pwcnet/correlation_op/make.sh @@ -0,0 +1,31 @@ +# source /ssd1/vis/liufanglong/.bashrc +#export PATH=/home/work/cuda-9.0/bin:$PATH +#export PATH=/home/work/cuda-9.0/bin:$PATH +#export LD_LIBRARY_PATH="/home/work/cuda-9.0/lib64:$LD_LIBRARY_PATH" +#export 
LD_LIBRARY_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/lib64:$LD_LIBRARY_PATH +#export CPLUS_INCLUDE_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/include:/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/include:$CPLUS_INCLUDE_PATH +#export LD_LIBRARY_PATH=/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/lib:$LD_LIBRARY_PATH + +include_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_include())' ) +lib_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_lib())' ) + +echo $include_dir +echo $lib_dir + +OPS='correlation_op' +for op in ${OPS} +do +nvcc ${op}.cu -c -o ${op}.cu.o -ccbin cc -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO -DPADDLE_WITH_MKLDNN -Xcompiler -fPIC -std=c++11 -Xcompiler -fPIC -w --expt-relaxed-constexpr -O0 -g -DNVCC \ + -I ${include_dir}/third_party/ \ + -I ${include_dir} +done + +# g++-4.8 correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \ +# g++ ${OPS}.cu.o ${OPS}.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \ +g++ correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \ + -I ${include_dir}/third_party/ \ + -I ${include_dir} \ + -L ${lib_dir} \ + -L /usr/local/cuda/lib64/ -lpaddle_framework -lcudart + +# rm *.cu.o diff --git a/applications/DAIN/pwcnet/correlation_op/test_correlation.py b/applications/DAIN/pwcnet/correlation_op/test_correlation.py new file mode 100644 index 0000000..6372b14 --- /dev/null +++ b/applications/DAIN/pwcnet/correlation_op/test_correlation.py @@ -0,0 +1,136 @@ +import unittest +from correlation import correlation +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph.base import to_variable + + +def corr(x_1, + x_2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1, + corr_multiply=1): + K = kernel_size + # rinput1 = np.pad(x_1, tuple([pad_size for _ in range(4)]), mode='constant').transpose(1, 2).transpose(2, 3) + # rinput2 = np.pad(x_2, tuple([pad_size for _ in range(4)]), mode='constant').transpose(1, 2).transpose(2, 3) + + rinput1 = np.pad(x_1, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput2 = np.pad(x_2, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput1 = np.transpose(rinput1, (0, 2, 3, 1)) + rinput2 = np.transpose(rinput2, (0, 2, 3, 1)) + B = int(rinput1.shape[0]) + H = int(x_1.shape[2]) + W = int(x_2.shape[3]) + d = max_displacement + D = 2 * d + 1 + output = np.zeros((B, D * D, H, W), dtype=np.float32) + + for b in range(B): + for i in range(H): + for j in range(W): + for k in range(-d, d + 1): + for l in range(-d, d + 1): + x1_index = i + pad_size + y1_index = j + pad_size + x2_index = x1_index + k + y2_index = y1_index + l + output[b, l + d + D * (k + d), i, + j] = np.mean(rinput1[b, x1_index:x1_index + K, + y1_index:y1_index + K] * + rinput2[b, x2_index:x2_index + K, + y2_index:y2_index + K]) + + return output + + +class TestCorrelationOp(unittest.TestCase): + def test_check_output(self): + #x_shape = (1, 196, 3, 3) + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + x1 = fluid.layers.data(name='x1', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + x2 = fluid.layers.data(name='x2', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 
5).astype(x_type) + out_np = corr(x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + out = correlation(x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + res = exe.run(feed={'x1': x1_np, 'x2': x2_np}, fetch_list=[out.name]) + + self.assertTrue(np.allclose(res[0], out_np)) + + +class Net(fluid.dygraph.Layer): + def __init__(self, name_scope): + super(Net, self).__init__(name_scope) + + def forward(self, x1, x2): + y = correlation(x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + return y + + +class TestCorrelationOpDyGraph(unittest.TestCase): + def test_check_output(self): + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + place = fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 5).astype(x_type) + out_np = corr(x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + x1 = to_variable(x1_np) + x2 = to_variable(x2_np) + corr_pd = Net('corr_pd') + y = corr_pd(x1, x2) + out = y.numpy() + self.assertTrue(np.allclose(out, out_np)) + + +if __name__ == '__main__': + unittest.main() diff --git a/applications/DAIN/pwcnet/pwcnet.py b/applications/DAIN/pwcnet/pwcnet.py new file mode 100644 index 0000000..75bd7e4 --- /dev/null +++ b/applications/DAIN/pwcnet/pwcnet.py @@ -0,0 +1,591 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
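+
+# PWC-Net with dense connections and a context network ("dc"): a six-level
+# feature pyramid (conv1a..conv6b), a cost volume computed by the custom
+# correlation op (md=4 -> (2 * 4 + 1)**2 = 81 channels), per-level flow
+# estimators, and a dilated context network (dc_conv1..dc_conv7) that
+# refines the final flow.
+#
+# Minimal usage sketch (illustration only, not part of this patch; assumes
+# the correlation op has been compiled and a GPU is available):
+#
+#     import numpy as np
+#     import paddle.fluid as fluid
+#     from pwcnet.pwcnet import PWCDCNet
+#
+#     with fluid.dygraph.guard(fluid.CUDAPlace(0)):
+#         net = PWCDCNet()
+#         # two RGB frames concatenated along the channel axis; H and W
+#         # should be divisible by 64 for the six-level pyramid
+#         pair = fluid.dygraph.to_variable(
+#             np.random.rand(1, 6, 256, 448).astype('float32'))
+#         flow = net(pair)  # quarter-resolution flow field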
+ +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph import Conv2D, Conv2DTranspose + +from .correlation_op.correlation import correlation + +__all__ = ['pwc_dc_net'] + + +class PWCDCNet(fluid.dygraph.Layer): + def __init__(self, md=4): + super(PWCDCNet, self).__init__() + self.md = md + self.param_attr = fluid.ParamAttr( + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0004), + initializer=fluid.initializer.MSRAInitializer(uniform=True, + fan_in=None, + seed=0)) + self.conv1a = Conv2D(3, 16, 3, 2, 1, param_attr=self.param_attr) + self.conv1aa = Conv2D(16, 16, 3, 1, 1, param_attr=self.param_attr) + self.conv1b = Conv2D(16, 16, 3, 1, 1, param_attr=self.param_attr) + self.conv2a = Conv2D(16, 32, 3, 2, 1, param_attr=self.param_attr) + self.conv2aa = Conv2D(32, 32, 3, 1, 1, param_attr=self.param_attr) + self.conv2b = Conv2D(32, 32, 3, 1, 1, param_attr=self.param_attr) + self.conv3a = Conv2D(32, 64, 3, 2, 1, param_attr=self.param_attr) + self.conv3aa = Conv2D(64, 64, 3, 1, 1, param_attr=self.param_attr) + self.conv3b = Conv2D(64, 64, 3, 1, 1, param_attr=self.param_attr) + self.conv4a = Conv2D(64, 96, 3, 2, 1, param_attr=self.param_attr) + self.conv4aa = Conv2D(96, 96, 3, 1, 1, param_attr=self.param_attr) + self.conv4b = Conv2D(96, 96, 3, 1, 1, param_attr=self.param_attr) + self.conv5a = Conv2D(96, 128, 3, 2, 1, param_attr=self.param_attr) + self.conv5aa = Conv2D(128, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv5b = Conv2D(128, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv6aa = Conv2D(128, 196, 3, 2, 1, param_attr=self.param_attr) + self.conv6a = Conv2D(196, 196, 3, 1, 1, param_attr=self.param_attr) + self.conv6b = Conv2D(196, 196, 3, 1, 1, param_attr=self.param_attr) + + nd = (2 * self.md + 1)**2 + dd = np.cumsum([128, 128, 96, 64, 32], dtype=np.int32).astype(np.int) + dd = [int(d) for d in dd] + od = nd + self.conv6_0 = Conv2D(od, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv6_1 = Conv2D(od + dd[0], + 128, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv6_2 = Conv2D(od + dd[1], + 96, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv6_3 = Conv2D(od + dd[2], + 64, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv6_4 = Conv2D(od + dd[3], + 32, + 3, + 1, + 1, + param_attr=self.param_attr) + self.predict_flow6 = Conv2D(od + dd[4], + 2, + 3, + 1, + 1, + param_attr=self.param_attr) + self.deconv6 = Conv2DTranspose(2, + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + self.upfeat6 = Conv2DTranspose(od + dd[4], + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + + od = nd + 128 + 4 + self.conv5_0 = Conv2D(od, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv5_1 = Conv2D(od + dd[0], + 128, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv5_2 = Conv2D(od + dd[1], + 96, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv5_3 = Conv2D(od + dd[2], + 64, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv5_4 = Conv2D(od + dd[3], + 32, + 3, + 1, + 1, + param_attr=self.param_attr) + self.predict_flow5 = Conv2D(od + dd[4], + 2, + 3, + 1, + 1, + param_attr=self.param_attr) + self.deconv5 = Conv2DTranspose(2, + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + self.upfeat5 = Conv2DTranspose(od + dd[4], + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + + od = nd + 96 + 4 + self.conv4_0 = Conv2D(od, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv4_1 = Conv2D(od + dd[0], + 128, + 3, + 1, + 1, + 
param_attr=self.param_attr) + self.conv4_2 = Conv2D(od + dd[1], + 96, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv4_3 = Conv2D(od + dd[2], + 64, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv4_4 = Conv2D(od + dd[3], + 32, + 3, + 1, + 1, + param_attr=self.param_attr) + self.predict_flow4 = Conv2D(od + dd[4], + 2, + 3, + 1, + 1, + param_attr=self.param_attr) + self.deconv4 = Conv2DTranspose(2, + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + self.upfeat4 = Conv2DTranspose(od + dd[4], + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + + od = nd + 64 + 4 + self.conv3_0 = Conv2D(od, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv3_1 = Conv2D(od + dd[0], + 128, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv3_2 = Conv2D(od + dd[1], + 96, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv3_3 = Conv2D(od + dd[2], + 64, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv3_4 = Conv2D(od + dd[3], + 32, + 3, + 1, + 1, + param_attr=self.param_attr) + self.predict_flow3 = Conv2D(od + dd[4], + 2, + 3, + 1, + 1, + param_attr=self.param_attr) + self.deconv3 = Conv2DTranspose(2, + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + self.upfeat3 = Conv2DTranspose(od + dd[4], + 2, + 4, + stride=2, + padding=1, + param_attr=self.param_attr) + + od = nd + 32 + 4 + self.conv2_0 = Conv2D(od, 128, 3, 1, 1, param_attr=self.param_attr) + self.conv2_1 = Conv2D(od + dd[0], + 128, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv2_2 = Conv2D(od + dd[1], + 96, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv2_3 = Conv2D(od + dd[2], + 64, + 3, + 1, + 1, + param_attr=self.param_attr) + self.conv2_4 = Conv2D(od + dd[3], + 32, + 3, + 1, + 1, + param_attr=self.param_attr) + self.predict_flow2 = Conv2D(od + dd[4], + 2, + 3, + 1, + 1, + param_attr=self.param_attr) + # self.deconv2 = Conv2DTranspose(2, 2, 4, stride=2, padding=1, param_attr=self.param_attr) + + self.dc_conv1 = Conv2D(od + dd[4], + 128, + 3, + 1, + 1, + dilation=1, + param_attr=self.param_attr) + self.dc_conv2 = Conv2D(128, + 128, + 3, + 1, + 2, + dilation=2, + param_attr=self.param_attr) + self.dc_conv3 = Conv2D(128, + 128, + 3, + 1, + 4, + dilation=4, + param_attr=self.param_attr) + self.dc_conv4 = Conv2D(128, + 96, + 3, + 1, + 8, + dilation=8, + param_attr=self.param_attr) + self.dc_conv5 = Conv2D(96, + 64, + 3, + 1, + 16, + dilation=16, + param_attr=self.param_attr) + self.dc_conv6 = Conv2D(64, + 32, + 3, + 1, + 1, + dilation=1, + param_attr=self.param_attr) + self.dc_conv7 = Conv2D(32, 2, 3, 1, 1, param_attr=self.param_attr) + + def warp(self, x, flo): + """ + warp an image/tensor (im2) back to im1, according to the optical flow + x: [B, C, H, W] (im2) + flo: [B, 2, H, W] flow + """ + x_shape = fluid.layers.shape(x) + B, H, W = x_shape[0], x_shape[2], x_shape[3] + bb = fluid.layers.range(0, B, 1, 'float32') + xx = fluid.layers.range(0, W, 1, 'float32') + yy = fluid.layers.range(0, H, 1, 'float32') + _, yy, xx = paddle.tensor.meshgrid(bb, yy, xx) + yy = fluid.layers.unsqueeze(yy, [1]) + xx = fluid.layers.unsqueeze(xx, [1]) + grid = fluid.layers.concat(input=[xx, yy], axis=1) + flo = flo + vgrid = fluid.layers.elementwise_add(grid, flo) + + vgrid_0 = 2.0 * fluid.layers.slice( + vgrid, axes=[1], starts=[0], ends=[1]) / (W - 1.) - 1.0 + vgrid_1 = 2.0 * fluid.layers.slice( + vgrid, axes=[1], starts=[1], ends=[2]) / (H - 1.) 
- 1.0
+
+        vgrid = fluid.layers.concat(input=[vgrid_0, vgrid_1], axis=1)
+        vgrid = fluid.layers.transpose(vgrid, [0, 2, 3, 1])
+        output = fluid.layers.grid_sampler(name='grid_sample', x=x, grid=vgrid)
+
+        # mask out sampling locations that fall outside the source image
+        mask = fluid.layers.zeros_like(x)
+        mask = mask + 1.0
+        mask = fluid.layers.grid_sampler(name='grid_sample', x=mask, grid=vgrid)
+        mask_temp1 = fluid.layers.cast(mask < 0.9990, 'float32')
+        mask = mask * (1 - mask_temp1)
+        mask = fluid.layers.cast(mask > 0, 'float32')
+        outwarp = fluid.layers.elementwise_mul(output, mask)
+
+        return outwarp
+
+    def warp_nomask(self, x, flo):
+        """
+        Warp an image/tensor (im2) back to im1 according to the optical
+        flow, without masking out-of-bounds samples.
+        x: [B, C, H, W] (im2)
+        flo: [B, 2, H, W] flow
+        """
+        x_shape = fluid.layers.shape(x)
+        B, H, W = x_shape[0], x_shape[2], x_shape[3]
+        bb = fluid.layers.range(0, B, 1, 'float32')
+        xx = fluid.layers.range(0, W, 1, 'float32')
+        yy = fluid.layers.range(0, H, 1, 'float32')
+        _, yy, xx = paddle.tensor.meshgrid(bb, yy, xx)
+        yy = fluid.layers.unsqueeze(yy, [1])
+        xx = fluid.layers.unsqueeze(xx, [1])
+
+        grid = fluid.layers.concat(input=[xx, yy], axis=1)
+        vgrid = fluid.layers.elementwise_add(grid, flo)
+        # rescale the sampling grid to [-1, 1] for grid_sampler
+        vgrid_0 = 2.0 * fluid.layers.slice(
+            vgrid, axes=[1], starts=[0], ends=[1]) / (W - 1.) - 1.0
+        vgrid_1 = 2.0 * fluid.layers.slice(
+            vgrid, axes=[1], starts=[1], ends=[2]) / (H - 1.)
- 1.0 + vgrid = fluid.layers.concat(input=[vgrid_0, vgrid_1], axis=1) + vgrid = fluid.layers.transpose(vgrid, [0, 2, 3, 1]) + output = fluid.layers.grid_sampler(name='grid_sample', x=x, grid=vgrid) + + return output + + def corr(self, x_1, x_2): + out = correlation(x_1, + x_2, + pad_size=self.md, + kernel_size=1, + max_displacement=self.md, + stride1=1, + stride2=1, + corr_type_multiply=1) + return out + + def forward(self, x, output_more=False): + im1 = fluid.layers.slice(x, axes=[1], starts=[0], ends=[3]) + im2 = fluid.layers.slice(x, axes=[1], starts=[3], ends=[6]) + # print("\n\n********************PWC Net details *************** \n\n") + c11 = fluid.layers.leaky_relu(self.conv1a(im1), 0.1) + c11 = fluid.layers.leaky_relu(self.conv1aa(c11), 0.1) + c11 = fluid.layers.leaky_relu(self.conv1b(c11), 0.1) + + c21 = fluid.layers.leaky_relu(self.conv1a(im2), 0.1) + c21 = fluid.layers.leaky_relu(self.conv1aa(c21), 0.1) + c21 = fluid.layers.leaky_relu(self.conv1b(c21), 0.1) + c12 = fluid.layers.leaky_relu(self.conv2a(c11), 0.1) + c12 = fluid.layers.leaky_relu(self.conv2aa(c12), 0.1) + c12 = fluid.layers.leaky_relu(self.conv2b(c12), 0.1) + + c22 = fluid.layers.leaky_relu(self.conv2a(c21), 0.1) + c22 = fluid.layers.leaky_relu(self.conv2aa(c22), 0.1) + c22 = fluid.layers.leaky_relu(self.conv2b(c22), 0.1) + + c13 = fluid.layers.leaky_relu(self.conv3a(c12), 0.1) + c13 = fluid.layers.leaky_relu(self.conv3aa(c13), 0.1) + c13 = fluid.layers.leaky_relu(self.conv3b(c13), 0.1) + + c23 = fluid.layers.leaky_relu(self.conv3a(c22), 0.1) + c23 = fluid.layers.leaky_relu(self.conv3aa(c23), 0.1) + c23 = fluid.layers.leaky_relu(self.conv3b(c23), 0.1) + + c14 = fluid.layers.leaky_relu(self.conv4a(c13), 0.1) + c14 = fluid.layers.leaky_relu(self.conv4aa(c14), 0.1) + c14 = fluid.layers.leaky_relu(self.conv4b(c14), 0.1) + + c24 = fluid.layers.leaky_relu(self.conv4a(c23), 0.1) + c24 = fluid.layers.leaky_relu(self.conv4aa(c24), 0.1) + c24 = fluid.layers.leaky_relu(self.conv4b(c24), 0.1) + + c15 = fluid.layers.leaky_relu(self.conv5a(c14), 0.1) + c15 = fluid.layers.leaky_relu(self.conv5aa(c15), 0.1) + c15 = fluid.layers.leaky_relu(self.conv5b(c15), 0.1) + + c25 = fluid.layers.leaky_relu(self.conv5a(c24), 0.1) + c25 = fluid.layers.leaky_relu(self.conv5aa(c25), 0.1) + c25 = fluid.layers.leaky_relu(self.conv5b(c25), 0.1) + + c16 = fluid.layers.leaky_relu(self.conv6aa(c15), 0.1) + c16 = fluid.layers.leaky_relu(self.conv6a(c16), 0.1) + c16 = fluid.layers.leaky_relu(self.conv6b(c16), 0.1) + + c26 = fluid.layers.leaky_relu(self.conv6aa(c25), 0.1) + c26 = fluid.layers.leaky_relu(self.conv6a(c26), 0.1) + c26 = fluid.layers.leaky_relu(self.conv6b(c26), 0.1) + + corr6 = self.corr(c16, c26) + corr6 = fluid.layers.leaky_relu(corr6, alpha=0.1) + + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv6_0(corr6), 0.1), corr6], + axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv6_1(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv6_2(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv6_3(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv6_4(x), 0.1), x], axis=1) + + flow6 = self.predict_flow6(x) + up_flow6 = self.deconv6(flow6) + up_feat6 = self.upfeat6(x) + + warp5 = self.warp(c25, up_flow6 * 0.625) + corr5 = self.corr(c15, warp5) + corr5 = fluid.layers.leaky_relu(corr5, alpha=0.1) + + x = fluid.layers.concat(input=[corr5, c15, up_flow6, up_feat6], axis=1) + x = 
fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv5_0(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv5_1(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv5_2(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv5_3(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv5_4(x), 0.1), x], axis=1) + + flow5 = self.predict_flow5(x) + up_flow5 = self.deconv5(flow5) + up_feat5 = self.upfeat5(x) + + warp4 = self.warp(c24, up_flow5 * 1.25) + corr4 = self.corr(c14, warp4) + corr4 = fluid.layers.leaky_relu(corr4, alpha=0.1) + + x = fluid.layers.concat(input=[corr4, c14, up_flow5, up_feat5], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv4_0(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv4_1(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv4_2(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv4_3(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv4_4(x), 0.1), x], axis=1) + + flow4 = self.predict_flow4(x) + up_flow4 = self.deconv4(flow4) + up_feat4 = self.upfeat4(x) + + warp3 = self.warp(c23, up_flow4 * 2.5) + corr3 = self.corr(c13, warp3) + corr3 = fluid.layers.leaky_relu(corr3, alpha=0.1) + + x = fluid.layers.concat(input=[corr3, c13, up_flow4, up_feat4], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv3_0(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv3_1(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv3_2(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv3_3(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv3_4(x), 0.1), x], axis=1) + + flow3 = self.predict_flow3(x) + up_flow3 = self.deconv3(flow3) + up_feat3 = self.upfeat3(x) + + warp2 = self.warp(c22, up_flow3 * 5.0) + corr2 = self.corr(c12, warp2) + corr2 = fluid.layers.leaky_relu(corr2, alpha=0.1) + + x = fluid.layers.concat(input=[corr2, c12, up_flow3, up_feat3], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv2_0(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv2_1(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv2_2(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv2_3(x), 0.1), x], axis=1) + x = fluid.layers.concat( + input=[fluid.layers.leaky_relu(self.conv2_4(x), 0.1), x], axis=1) + + flow2 = self.predict_flow2(x) + + x = fluid.layers.leaky_relu( + self.dc_conv4( + fluid.layers.leaky_relu( + self.dc_conv3( + fluid.layers.leaky_relu( + self.dc_conv2( + fluid.layers.leaky_relu(self.dc_conv1(x), 0.1)), + 0.1)), 0.1)), 0.1) + flow2 += self.dc_conv7( + fluid.layers.leaky_relu( + self.dc_conv6(fluid.layers.leaky_relu(self.dc_conv5(x), 0.1)), + 0.1)) + + if not output_more: + return flow2 + else: + return [flow2, flow3, flow4, flow5, flow6] + + +def pwc_dc_net(path=None): + model = PWCDCNet() + if path is not None: + import pickle + data = pickle.load(open(path, 'rb')) + weight_list = [] + for k, v in data.items(): + weight_list.append(v) + param_dict = {} + for i, param in enumerate(model.parameters()): + param_dict[param.name] = 
weight_list[i] + model.load_dict(param_dict) + + return model diff --git a/applications/DAIN/resblock/__init__.py b/applications/DAIN/resblock/__init__.py new file mode 100644 index 0000000..f11f7b5 --- /dev/null +++ b/applications/DAIN/resblock/__init__.py @@ -0,0 +1 @@ +from .basicblock import * diff --git a/applications/DAIN/resblock/basicblock.py b/applications/DAIN/resblock/basicblock.py new file mode 100644 index 0000000..a8333a4 --- /dev/null +++ b/applications/DAIN/resblock/basicblock.py @@ -0,0 +1,94 @@ +import paddle.fluid as fluid +from paddle.fluid.dygraph import Conv2D + +__all__ = ['MultipleBasicBlock', 'MultipleBasicBlock_4'] + + +def conv3x3(in_planes, out_planes, dilation=1, stride=1, param_attr=None): + return Conv2D(in_planes, + out_planes, + filter_size=3, + stride=stride, + padding=int(dilation * (3 - 1) / 2), + dilation=dilation, + bias_attr=False, + param_attr=param_attr) + + +class BasicBlock(fluid.dygraph.Layer): + expansion = 1 + + def __init__(self, inplanes, planes, dilation=1, stride=1, downsample=None): + super(BasicBlock, self).__init__() + + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=1.0, seed=0)) + + self.conv1 = conv3x3(inplanes, planes, dilation, stride, param_attr) + self.conv2 = conv3x3(planes, planes, param_attr=param_attr) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + # out = self.bn1(out) + out = fluid.layers.relu(out) + + out = self.conv2(out) + # out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = fluid.layers.relu(out) + + return out + + +class MultipleBasicBlock(fluid.dygraph.Layer): + def __init__(self, + input_feature, + block, + num_blocks, + intermediate_feature=64, + dense=True): + super(MultipleBasicBlock, self).__init__() + self.dense = dense + self.num_block = num_blocks + self.intermediate_feature = intermediate_feature + + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=1.0, seed=0)) + + self.block1 = Conv2D(input_feature, + intermediate_feature, + filter_size=7, + stride=1, + padding=3, + bias_attr=True, + param_attr=param_attr) + + dim = intermediate_feature + self.block2 = block(dim, dim, dilation=1) if num_blocks >= 2 else None + self.block3 = block(dim, dim, dilation=1) if num_blocks >= 3 else None + self.block4 = block(dim, dim, dilation=1) if num_blocks >= 4 else None + self.block5 = Conv2D(dim, 3, 3, 1, 1) + + def forward(self, x): + x = fluid.layers.relu(self.block1(x)) + x = self.block2(x) if self.num_block >= 2 else x + x = self.block3(x) if self.num_block >= 3 else x + x = self.block4(x) if self.num_block >= 4 else x + x = self.block5(x) + return x + + +def MultipleBasicBlock_4(input_feature, intermediate_feature=64): + model = MultipleBasicBlock(input_feature, BasicBlock, 4, + intermediate_feature) + return model diff --git a/applications/DAIN/run.sh b/applications/DAIN/run.sh new file mode 100644 index 0000000..aaf96ec --- /dev/null +++ b/applications/DAIN/run.sh @@ -0,0 +1,20 @@ +cd pwcnet/correlation_op +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`python -c 'import paddle; print(paddle.sysconfig.get_lib())'` +export PYTHONPATH=$PYTHONPATH:`pwd` +cd ../../ + +VID_PATH=/paddle/work/github/DAIN/data/CBA.mp4 +OUT_PATH=output +MODEL_PATH=DAIN_paddle_weight + +#CUDA_VISIBLE_DEVICES=1 python demo.py \ +# --time_step 0.125 \ +# --video_path=$VID_PATH \ +# --output_path=$OUT_PATH \ +# 
--saved_model=$MODEL_PATH + +CUDA_VISIBLE_DEVICES=2 python predict.py \ + --time_step 0.125 \ + --video_path=$VID_PATH \ + --output_path=$OUT_PATH \ + --saved_model=$MODEL_PATH diff --git a/applications/DAIN/util.py b/applications/DAIN/util.py new file mode 100644 index 0000000..de6e18c --- /dev/null +++ b/applications/DAIN/util.py @@ -0,0 +1,102 @@ +import os, sys +import glob +import shutil + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def dump_frames_ffmpeg(vid_path, outpath, r=None, ss=None, t=None): + ffmpeg = ['ffmpeg ', ' -loglevel ', ' error '] + vid_name = vid_path.split('/')[-1].split('.')[0] + out_full_path = os.path.join(outpath, vid_name) + + if not os.path.exists(out_full_path): + os.makedirs(out_full_path) + + # video file name + outformat = out_full_path + '/%08d.png' + + if ss is not None and t is not None and r is not None: + cmd = ffmpeg + [ + ' -ss ', + ss, + ' -t ', + t, + ' -i ', + vid_path, + ' -r ', + r, + # ' -f ', ' image2 ', + # ' -s ', ' 960*540 ', + ' -qscale:v ', + ' 0.1 ', + ' -start_number ', + ' 0 ', + # ' -qmax ', ' 1 ', + outformat + ] + else: + cmd = ffmpeg + [' -i ', vid_path, ' -start_number ', ' 0 ', outformat] + + cmd = ''.join(cmd) + print(cmd) + if os.system(cmd) == 0: + print('Video: {} done'.format(vid_name)) + else: + print('Video: {} error'.format(vid_name)) + print('') + sys.stdout.flush() + return out_full_path + + +def frames_to_video_ffmpeg(framepath, videopath, r): + ffmpeg = ['ffmpeg ', ' -loglevel ', ' error '] + cmd = ffmpeg + [ + ' -r ', r, ' -f ', ' image2 ', ' -i ', framepath, ' -vcodec ', + ' libx264 ', ' -pix_fmt ', ' yuv420p ', ' -crf ', ' 16 ', videopath + ] + cmd = ''.join(cmd) + print(cmd) + + if os.system(cmd) == 0: + print('Video: {} done'.format(videopath)) + else: + print('Video: {} error'.format(videopath)) + print('') + sys.stdout.flush() + + +def combine_frames(input, interpolated, combined, num_frames): + frames1 = sorted(glob.glob(os.path.join(input, '*.png'))) + frames2 = sorted(glob.glob(os.path.join(interpolated, '*.png'))) + num1 = len(frames1) + num2 = len(frames2) + # assert (num1 - 1) * num_frames == num2 + for i in range(num1): + src = frames1[i] + imgname = int(src.split('/')[-1].split('.')[-2]) + assert i == imgname + dst = os.path.join(combined, '{:08d}.png'.format(i * (num_frames + 1))) + shutil.copy2(src, dst) + if i < num1 - 1: + for k in range(num_frames): + src = frames2[i * num_frames + k] + dst = os.path.join( + combined, '{:08d}.png'.format(i * (num_frames + 1) + k + 1)) + shutil.copy2(src, dst) -- GitLab
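
Usage sketch (illustrative, not part of the patch): how the helpers in
applications/DAIN/util.py compose into the interpolation pipeline. The paths,
fps and time_step values below are assumptions; demo.py/predict.py run the
DAIN model between steps 1 and 3.

    import os
    from util import (dump_frames_ffmpeg, combine_frames,
                      frames_to_video_ffmpeg)

    vid = 'data/input.mp4'                 # hypothetical input video
    out = 'output'
    time_step = 0.125                      # 8x interpolation
    num_frames = int(1.0 / time_step) - 1  # 7 new frames per adjacent pair

    # 1) decode the source video into PNG frames; returns the frames dir
    in_dir = dump_frames_ffmpeg(vid, os.path.join(out, 'frames-input'))

    # 2) model step (omitted): DAIN writes num_frames intermediate PNGs per
    #    adjacent input pair into output/frames-interpolated/input/

    # 3) interleave original and interpolated frames by index
    comb_dir = os.path.join(out, 'frames-combined', 'input')
    os.makedirs(comb_dir, exist_ok=True)
    combine_frames(in_dir, os.path.join(out, 'frames-interpolated', 'input'),
                   comb_dir, num_frames)

    # 4) re-encode at old_fps * (1 / time_step), e.g. 25 * 8 = 200
    vid_out_dir = os.path.join(out, 'videos-output')
    os.makedirs(vid_out_dir, exist_ok=True)
    frames_to_video_ffmpeg(os.path.join(comb_dir, '%08d.png'),
                           os.path.join(vid_out_dir, 'input.mp4'), '200')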