提交 e41aff14 编写于 作者: Q qingqing01

DAIN for video frame interpolation

上级 c56dbd8f
import os, sys
import math
import random
import time
import glob
import shutil
import numpy as np
from imageio import imread, imsave
import cv2
import paddle.fluid as fluid
import networks
from util import *
from my_args import args
if __name__ == '__main__':
DO_MiddleBurryOther = True
video_path = args.video_path
output_path = args.output_path
frame_path_input = os.path.join(output_path, 'frames-input')
frame_path_interpolated = os.path.join(output_path, 'frames-interpolated')
frame_path_combined = os.path.join(output_path, 'frames-combined')
video_path_input = os.path.join(output_path, 'videos-input')
video_path_output = os.path.join(output_path, 'videos-output')
if not os.path.exists(output_path):
os.makedirs(output_path)
if not os.path.exists(frame_path_input):
os.makedirs(frame_path_input)
if not os.path.exists(frame_path_interpolated):
os.makedirs(frame_path_interpolated)
if not os.path.exists(frame_path_combined):
os.makedirs(frame_path_combined)
if not os.path.exists(video_path_input):
os.makedirs(video_path_input)
if not os.path.exists(video_path_output):
os.makedirs(video_path_output)
args.KEY_FRAME_THREAD = 0.
saved_model = args.saved_model
timestep = args.time_step
num_frames = int(1.0 / timestep) - 1
image = fluid.data(name='image',
shape=[2, 1, args.channels, -1, -1],
dtype='float32')
DAIN = networks.__dict__["DAIN_slowmotion"](channel=args.channels,
filter_size=args.filter_size,
timestep=args.time_step,
training=False)
out = DAIN(image)
out = out[0][1]
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
fetch_list = [out.name]
inference_program = fluid.default_main_program().clone(for_test=True)
inference_program = fluid.io.load_persistables(exe, saved_model,
inference_program)
if not DO_MiddleBurryOther:
sys.exit()
if video_path.endswith('.mp4'):
videos = [video_path]
else:
videos = sorted(glob.glob(os.path.join(video_path, '*.mp4')))
for cnt, vid in enumerate(videos):
print("Interpolating video:", vid)
cap = cv2.VideoCapture(vid)
fps = cap.get(cv2.CAP_PROP_FPS)
print("Old fps (frame rate): ", fps)
timestep = args.time_step
times_interp = int(1.0 / timestep)
r2 = str(int(fps) * times_interp)
print("New fps (frame rate): ", r2)
# set start and end of video
#ss = 0
#t = 10
#ss = time.strftime('%H:%M:%S', time.gmtime(ss))
#t = time.strftime('%H:%M:%S', time.gmtime(t))
#print(r, ss, t)
r = None
ss = None
t = None
out_path = dump_frames_ffmpeg(vid, frame_path_input, r, ss, t)
vidname = vid.split('/')[-1].split('.')[0]
tot_timer = AverageMeter()
proc_timer = AverageMeter()
end = time.time()
frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
img = imread(frames[0])
int_width = img.shape[1]
int_height = img.shape[0]
channel = img.shape[2]
if not channel == 3:
continue
if int_width != ((int_width >> 7) << 7):
int_width_pad = (((int_width >> 7) + 1) << 7) # more than necessary
padding_left = int((int_width_pad - int_width) / 2)
padding_right = int_width_pad - int_width - padding_left
else:
int_width_pad = int_width
padding_left = 32
padding_right = 32
if int_height != ((int_height >> 7) << 7):
int_height_pad = (
((int_height >> 7) + 1) << 7) # more than necessary
padding_top = int((int_height_pad - int_height) / 2)
padding_bottom = int_height_pad - int_height - padding_top
else:
int_height_pad = int_height
padding_top = 32
padding_bottom = 32
frame_num = len(frames)
print(os.path.join(frame_path_input, vidname, '*.png'))
print('processing {} frames, from video: {}'.format(frame_num, vid))
if not os.path.exists(os.path.join(frame_path_interpolated, vidname)):
os.makedirs(os.path.join(frame_path_interpolated, vidname))
if not os.path.exists(os.path.join(frame_path_combined, vidname)):
os.makedirs(os.path.join(frame_path_combined, vidname))
for i in range(frame_num - 1):
print(frames[i])
first = frames[i]
second = frames[i + 1]
img_first = imread(first)
img_second = imread(second)
'''--------------Frame change test------------------------'''
img_first_gray = np.dot(img_first[..., :3], [0.299, 0.587, 0.114])
img_second_gray = np.dot(img_second[..., :3], [0.299, 0.587, 0.114])
img_first_gray = img_first_gray.flatten(order='C')
img_second_gray = img_second_gray.flatten(order='C')
corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1]
key_frame = False
if corr < args.KEY_FRAME_THREAD:
key_frame = True
'''-------------------------------------------------------'''
X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255
X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255
if key_frame:
y_ = [
np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0))
for i in range(num_frames)
]
else:
assert (X0.shape[1] == X1.shape[1])
assert (X0.shape[2] == X1.shape[2])
print("size before padding ", X0.shape)
X0 = np.pad(X0, ((0,0), (padding_top, padding_bottom), \
(padding_left, padding_right)), mode='edge')
X1 = np.pad(X1, ((0,0), (padding_top, padding_bottom), \
(padding_left, padding_right)), mode='edge')
print("size after padding ", X0.shape)
X0 = np.expand_dims(X0, axis=0)
X1 = np.expand_dims(X1, axis=0)
X0 = np.expand_dims(X0, axis=0)
X1 = np.expand_dims(X1, axis=0)
X = np.concatenate((X0, X1), axis=0)
proc_end = time.time()
o = exe.run(inference_program,
fetch_list=fetch_list,
feed={"image": X})
y_ = o[0]
proc_timer.update(time.time() - proc_end)
tot_timer.update(time.time() - end)
end = time.time()
print("*******current image process time \t " +
str(time.time() - proc_end) + "s ******")
y_ = [
np.transpose(
255.0 * item.clip(
0, 1.0)[0, :, padding_top:padding_top + int_height,
padding_left:padding_left + int_width],
(1, 2, 0)) for item in y_
]
time_offsets = [
kk * timestep for kk in range(1, 1 + num_frames, 1)
]
count = 1
for item, time_offset in zip(y_, time_offsets):
out_dir = os.path.join(
frame_path_interpolated, vidname,
"{:0>4d}_{:0>4d}.png".format(i, count))
count = count + 1
imsave(out_dir, np.round(item).astype(np.uint8))
timestep = args.time_step
num_frames = int(1.0 / timestep) - 1
input_dir = os.path.join(frame_path_input, vidname)
interpolated_dir = os.path.join(frame_path_interpolated, vidname)
combined_dir = os.path.join(frame_path_combined, vidname)
combine_frames(input_dir, interpolated_dir, combined_dir, num_frames)
frame_pattern_combined = os.path.join(frame_path_combined, vidname,
'%08d.png')
video_pattern_output = os.path.join(video_path_output, vidname + '.mp4')
if os.path.exists(video_pattern_output):
os.remove(video_pattern_output)
frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output, r2)
import os
import datetime
import argparse
import numpy
import networks
modelnames = networks.__all__
# import datasets
datasetNames = ('Vimeo_90K_interp') #datasets.__all__
parser = argparse.ArgumentParser(description='DAIN')
parser.add_argument('--debug', action='store_true', help='Enable debug mode')
parser.add_argument('--netName',
type=str,
default='DAIN',
choices=modelnames,
help='model architecture: ' + ' | '.join(modelnames) +
' (default: DAIN)')
parser.add_argument('--datasetName',
default='Vimeo_90K_interp',
choices=datasetNames,
nargs='+',
help='dataset type : ' + ' | '.join(datasetNames) +
' (default: Vimeo_90K_interp)')
parser.add_argument('--video_path',
default='',
help='the path of selected videos')
parser.add_argument('--output_path', default='', help='the output root path')
parser.add_argument('--seed',
type=int,
default=1,
help='random seed (default: 1)')
parser.add_argument('--batch_size',
'-b',
type=int,
default=1,
help='batch size (default:1)')
parser.add_argument('--channels',
'-c',
type=int,
default=3,
choices=[1, 3],
help='channels of images (default:3)')
parser.add_argument('--filter_size',
'-f',
type=int,
default=4,
help='the size of filters used (default: 4)',
choices=[2, 4, 6, 5, 51])
parser.add_argument('--time_step',
type=float,
default=0.5,
help='choose the time steps')
parser.add_argument(
'--alpha',
type=float,
nargs='+',
default=[0.0, 1.0],
help=
'the ration of loss for interpolated and rectified result (default: [0.0, 1.0])'
)
parser.add_argument('--frame_rate',
type=int,
default=None,
help='frame rate of the input video')
parser.add_argument('--patience',
type=int,
default=5,
help='the patience of reduce on plateou')
parser.add_argument('--factor',
type=float,
default=0.2,
help='the factor of reduce on plateou')
parser.add_argument('--saved_model',
type=str,
default='',
help='path to the model weights')
parser.add_argument('--no-date',
action='store_true',
help='don\'t append date timestamp to folder')
parser.add_argument('--use_cuda',
default=True,
type=bool,
help='use cuda or not')
parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not')
args = parser.parse_args()
from .dain import DAIN
from .dain_slowmotion import DAIN_slowmotion
__all__ = ('DAIN', 'DAIN_slowmotion')
import paddle.fluid as fluid
import resblock
import pwcnet
class DAIN(fluid.dygraph.Layer):
def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True):
# base class initialization
super(DAIN, self).__init__()
self.filter_size = filter_size
self.training = training
self.timestep = timestep
assert (timestep == 0.5)
self.numFrames = int(1.0 / timestep) - 1
ctx_ch = 3 * 64 + 3
inplanes = 3 + 3 + 3 + 2 * 1 + 2 * 2 + 16 * 2 + 2 * ctx_ch
self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes,
64)
self.flownets = pwcnet.__dict__['pwc_dc_net']()
self.div_flow = 20.0
def forward(self, input):
"""
Parameters
----------
input: shape (3, batch, 3, width, height)
-----------
"""
losses = []
offsets = []
'''
STEP 1: sequeeze the input
'''
if self.training == True:
assert input.shape[0] == 3
input_0 = input[0]
input_1 = input[1]
input_2 = input[2]
else:
# print(input.shape[0])
assert input.shape[0] == 2
input_0 = input[0]
input_2 = input[1]
#prepare the input data of current scale
cur_input_0 = input_0
if self.training == True:
cur_input_1 = input_1
cur_input_2 = input_2
'''
STEP 3.2: concatenating the inputs.
'''
cur_offset_input = fluid.layers.concat([cur_input_0, cur_input_2],
axis=1)
'''
STEP 3.3: perform the estimation
'''
time_offsets = [
kk * self.timestep for kk in range(1, 1 + self.numFrames, 1)
]
cur_offset_outputs = [
self.forward_flownets(self.flownets,
cur_offset_input,
time_offsets=time_offsets),
self.forward_flownets(self.flownets,
fluid.layers.concat(
[cur_input_2, cur_input_0], axis=1),
time_offsets=time_offsets[::-1])
]
cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]]
# Warp image use warp-op in PWC-Net
ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0])
ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1])
cur_output = (ref0 + ref2) / 2.0
rectify_input = fluid.layers.concat([
cur_output, ref0, ref2, cur_offset_output[0], cur_offset_output[1]
],
axis=1)
cur_output_rectified = self.rectifyNet(rectify_input) + cur_output
'''
STEP 3.5: for training phase, we collect the variables to be penalized.
'''
if self.training == True:
losses += [cur_output - cur_input_1]
losses += [cur_output_rectified - cur_input_1]
offsets += [cur_offset_output]
'''
STEP 4: return the results
'''
if self.training == True:
# if in the training phase, we output the losses to be minimized.
# return losses, loss_occlusion
return losses, offsets
else:
cur_outputs = [cur_output, cur_output_rectified]
return cur_outputs, cur_offset_output
def forward_flownets(self, model, input, time_offsets=None):
if time_offsets == None:
time_offsets = [0.5]
elif type(time_offsets) == float:
time_offsets = [time_offsets]
elif type(time_offsets) == list:
pass
# this is a single direction motion results, but not a bidirectional one
temp = model(input)
# single direction to bidirection should haven it.
temps = [
self.div_flow * temp * time_offset for time_offset in time_offsets
]
# nearest interpolation won't be better i think
temps = [fluid.layers.resize_bilinear(temp, scale=4) for temp in temps]
return temps
import paddle.fluid as fluid
import resblock
import time
import pwcnet
class DAIN_slowmotion(fluid.dygraph.Layer):
def __init__(self, channel=3, filter_size=4, timestep=0.5, training=True):
# base class initialization
super(DAIN_slowmotion, self).__init__()
self.filter_size = filter_size
self.training = training
self.timestep = timestep
self.num_frames = int(1.0 / timestep) - 1
ctx_ch = 3 * 64 + 3
# inplanes = 3 + 3 + 3 + 2*1 + 2*2 + 2
inplanes = 13
self.flownets = pwcnet.__dict__['pwc_dc_net']()
self.rectifyNet = resblock.__dict__['MultipleBasicBlock_4'](inplanes,
64)
self.div_flow = 20.0
def forward(self, input):
"""
Parameters
----------
input: shape (3, batch, 3, width, height)
-----------
"""
losses = []
offsets = []
'''
STEP 1: sequeeze the input
'''
if self.training == True:
assert input.shape[0] == 3
input_0 = input[0]
input_1 = input[1]
input_2 = input[2]
else:
assert input.shape[0] == 2
input_0 = input[0]
input_2 = input[1]
#prepare the input data of current scale
cur_input_0 = input_0
if self.training == True:
cur_input_1 = input_1
cur_input_2 = input_2
'''
STEP 3.2: concatenating the inputs.
'''
cur_offset_input = fluid.layers.concat([cur_input_0, cur_input_2],
axis=1)
'''
STEP 3.3: perform the estimation
'''
time_offsets = [
kk * self.timestep for kk in range(1, 1 + self.num_frames, 1)
]
cur_offset_outputs = [
self.forward_flownets(self.flownets,
cur_offset_input,
time_offsets=time_offsets),
self.forward_flownets(self.flownets,
fluid.layers.concat(
[cur_input_2, cur_input_0], axis=1),
time_offsets=time_offsets[::-1])
]
'''
STEP 3.4: perform the frame interpolation process
'''
count = 0
for temp_0, temp_1, timeoffset in zip(cur_offset_outputs[0],
cur_offset_outputs[1],
time_offsets):
cur_offset_output = [temp_0, temp_1]
ref0 = self.flownets.warp_nomask(cur_input_0, cur_offset_output[0])
ref2 = self.flownets.warp_nomask(cur_input_2, cur_offset_output[1])
cur_output_temp = (ref0 + ref2) / 2.0
if count == 0:
cur_output = fluid.layers.unsqueeze(cur_output_temp, axes=0)
else:
cur_output_ = fluid.layers.unsqueeze(cur_output_temp, axes=0)
cur_output = fluid.layers.concat([cur_output, cur_output_],
axis=0)
rectify_input = fluid.layers.concat([
cur_output_temp, ref0, ref2, cur_offset_output[0],
cur_offset_output[1]
],
axis=1)
cur_output_rectified_temp = self.rectifyNet(
rectify_input) + cur_output_temp
if count == 0:
cur_output_rectified = fluid.layers.unsqueeze(
cur_output_rectified_temp, axes=0)
else:
cur_output_rectified_ = fluid.layers.unsqueeze(
cur_output_rectified_temp, axes=0)
cur_output_rectified = fluid.layers.concat(
[cur_output_rectified, cur_output_rectified_], axis=0)
count += 1
'''
STEP 3.5: for training phase, we collect the variables to be penalized.
'''
if self.training == True:
losses += [cur_output - cur_input_1]
losses += [cur_output_rectified - cur_input_1]
offsets += [cur_offset_output]
'''
STEP 4: return the results
'''
if self.training == True:
# if in the training phase, we output the losses to be minimized.
# return losses, loss_occlusion
return losses, offsets
else:
cur_outputs = [cur_output, cur_output_rectified]
return cur_outputs, cur_offset_output
def forward_flownets(self, model, input, time_offsets=None):
if time_offsets == None:
time_offsets = [0.5]
elif type(time_offsets) == float:
time_offsets = [time_offsets]
elif type(time_offsets) == list:
pass
# this is a single direction motion results, but not a bidirectional one
temp = model(input)
# single direction to bidirection should haven it.
temps = [
self.div_flow * temp * time_offset for time_offset in time_offsets
]
# nearest interpolation won't be better i think
temps = [fluid.layers.resize_bilinear(temp, scale=4) for temp in temps]
return temps
import os, sys
import math
import random
import time
import glob
import shutil
import numpy as np
from imageio import imread, imsave
import cv2
import paddle.fluid as fluid
import networks
from util import *
from my_args import args
def infer_engine(model_dir,
run_mode='fluid',
batch_size=1,
use_gpu=False,
min_subgraph_size=3):
if not use_gpu and not run_mode == 'fluid':
raise ValueError(
"Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}"
.format(run_mode, use_gpu))
precision_map = {
'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
}
config = fluid.core.AnalysisConfig(os.path.join(model_dir, 'model'),
os.path.join(model_dir, 'params'))
if use_gpu:
# initial GPU memory(M), device ID
config.enable_use_gpu(100, 0)
# optimize graph and fuse op
config.switch_ir_optim(True)
else:
config.disable_gpu()
if run_mode in precision_map.keys():
config.enable_tensorrt_engine(workspace_size=1 << 10,
max_batch_size=batch_size,
min_subgraph_size=min_subgraph_size,
precision_mode=precision_map[run_mode],
use_static=False,
use_calib_mode=False)
# disable print log when predict
config.disable_glog_info()
# enable shared memory
config.enable_memory_optim()
# disable feed, fetch OP, needed by zero_copy_run
config.switch_use_feed_fetch_ops(False)
predictor = fluid.core.create_paddle_predictor(config)
return predictor
def executor(model_dir, use_gpu=False):
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
program, feed_names, fetch_targets = fluid.io.load_inference_model(
dirname=model_dir,
executor=exe,
model_filename='model',
params_filename='params')
return exe, program, fetch_targets
class VideoFrameInterp(object):
def __init__(self,
time_step,
model_path,
video_path,
use_gpu=True,
key_frame_thread=0.,
output_path='output'):
self.video_path = video_path
self.output_path = output_path
self.model_path = model_path
self.time_step = time_step
self.key_frame_thread = key_frame_thread
self.exe, self.program, self.fetch_targets = executor(model_path,
use_gpu=use_gpu)
# self.predictor = load_predictor(
# model_dir,
# run_mode=run_mode,
# min_subgraph_size=3,
# use_gpu=use_gpu)
def run(self):
frame_path_input = os.path.join(self.output_path, 'frames-input')
frame_path_interpolated = os.path.join(self.output_path,
'frames-interpolated')
frame_path_combined = os.path.join(self.output_path, 'frames-combined')
video_path_output = os.path.join(self.output_path, 'videos-output')
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
if not os.path.exists(frame_path_input):
os.makedirs(frame_path_input)
if not os.path.exists(frame_path_interpolated):
os.makedirs(frame_path_interpolated)
if not os.path.exists(frame_path_combined):
os.makedirs(frame_path_combined)
if not os.path.exists(video_path_output):
os.makedirs(video_path_output)
timestep = self.time_step
num_frames = int(1.0 / timestep) - 1
if self.video_path.endswith('.mp4'):
videos = [self.video_path]
else:
videos = sorted(glob.glob(os.path.join(self.video_path, '*.mp4')))
for cnt, vid in enumerate(videos):
print("Interpolating video:", vid)
cap = cv2.VideoCapture(vid)
fps = cap.get(cv2.CAP_PROP_FPS)
print("Old fps (frame rate): ", fps)
times_interp = int(1.0 / timestep)
r2 = str(int(fps) * times_interp)
print("New fps (frame rate): ", r2)
out_path = dump_frames_ffmpeg(vid, frame_path_input)
vidname = vid.split('/')[-1].split('.')[0]
tot_timer = AverageMeter()
proc_timer = AverageMeter()
end = time.time()
frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
img = imread(frames[0])
int_width = img.shape[1]
int_height = img.shape[0]
channel = img.shape[2]
if not channel == 3:
continue
if int_width != ((int_width >> 7) << 7):
int_width_pad = (
((int_width >> 7) + 1) << 7) # more than necessary
padding_left = int((int_width_pad - int_width) / 2)
padding_right = int_width_pad - int_width - padding_left
else:
int_width_pad = int_width
padding_left = 32
padding_right = 32
if int_height != ((int_height >> 7) << 7):
int_height_pad = (
((int_height >> 7) + 1) << 7) # more than necessary
padding_top = int((int_height_pad - int_height) / 2)
padding_bottom = int_height_pad - int_height - padding_top
else:
int_height_pad = int_height
padding_top = 32
padding_bottom = 32
frame_num = len(frames)
print('processing {} frames, from video: {}'.format(frame_num, vid))
if not os.path.exists(os.path.join(frame_path_interpolated,
vidname)):
os.makedirs(os.path.join(frame_path_interpolated, vidname))
if not os.path.exists(os.path.join(frame_path_combined, vidname)):
os.makedirs(os.path.join(frame_path_combined, vidname))
for i in range(frame_num - 1):
print(frames[i])
first = frames[i]
second = frames[i + 1]
img_first = imread(first)
img_second = imread(second)
'''--------------Frame change test------------------------'''
img_first_gray = np.dot(img_first[..., :3],
[0.299, 0.587, 0.114])
img_second_gray = np.dot(img_second[..., :3],
[0.299, 0.587, 0.114])
img_first_gray = img_first_gray.flatten(order='C')
img_second_gray = img_second_gray.flatten(order='C')
corr = np.corrcoef(img_first_gray, img_second_gray)[0, 1]
key_frame = False
if corr < self.key_frame_thread:
key_frame = True
'''-------------------------------------------------------'''
X0 = img_first.astype('float32').transpose((2, 0, 1)) / 255
X1 = img_second.astype('float32').transpose((2, 0, 1)) / 255
if key_frame:
y_ = [
np.transpose(255.0 * X0.clip(0, 1.0), (1, 2, 0))
for i in range(num_frames)
]
else:
assert (X0.shape[1] == X1.shape[1])
assert (X0.shape[2] == X1.shape[2])
print("size before padding ", X0.shape)
X0 = np.pad(X0, ((0,0), (padding_top, padding_bottom), \
(padding_left, padding_right)), mode='edge')
X1 = np.pad(X1, ((0,0), (padding_top, padding_bottom), \
(padding_left, padding_right)), mode='edge')
print("size after padding ", X0.shape)
X0 = np.expand_dims(X0, axis=0)
X1 = np.expand_dims(X1, axis=0)
X0 = np.expand_dims(X0, axis=0)
X1 = np.expand_dims(X1, axis=0)
X = np.concatenate((X0, X1), axis=0)
proc_end = time.time()
o = self.exe.run(self.program,
fetch_list=self.fetch_targets,
feed={"image": X})
y_ = o[0]
proc_timer.update(time.time() - proc_end)
tot_timer.update(time.time() - end)
end = time.time()
print("*********** current image process time \t " +
str(time.time() - proc_end) + "s *********")
y_ = [
np.transpose(
255.0 * item.clip(
0, 1.0)[0, :,
padding_top:padding_top + int_height,
padding_left:padding_left + int_width],
(1, 2, 0)) for item in y_
]
time_offsets = [
kk * timestep for kk in range(1, 1 + num_frames, 1)
]
count = 1
for item, time_offset in zip(y_, time_offsets):
out_dir = os.path.join(
frame_path_interpolated, vidname,
"{:0>4d}_{:0>4d}.png".format(i, count))
count = count + 1
imsave(out_dir, np.round(item).astype(np.uint8))
num_frames = int(1.0 / timestep) - 1
input_dir = os.path.join(frame_path_input, vidname)
interpolated_dir = os.path.join(frame_path_interpolated, vidname)
combined_dir = os.path.join(frame_path_combined, vidname)
combine_frames(input_dir, interpolated_dir, combined_dir,
num_frames)
frame_pattern_combined = os.path.join(frame_path_combined, vidname,
'%08d.png')
video_pattern_output = os.path.join(video_path_output,
vidname + '.mp4')
if os.path.exists(video_pattern_output):
os.remove(video_pattern_output)
frames_to_video_ffmpeg(frame_pattern_combined, video_pattern_output,
r2)
if __name__ == '__main__':
predictor = VideoFrameInterp(args.time_step, args.saved_model,
args.video_path, args.output_path)
predictor.run()
自定义OP编译:
2. sh make.sh编译成correlation_lib.so动态库
3. 添加动态库路径到LD_LIBRARY_PATH:
```
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`python3.7 -c 'import paddle; print(paddle.sysconfig.get_lib())'`
```
4. 添加correlation op的python路径:
```
export PYTHONPATH=$PYTHONPATH:`pwd`
```
5. python test_correlation.py运行单测,验证是否加载成功。
PS: 如果paddle whl包是从官网上下载的,需要使用gcc 4.8,即把make.sh中的g++ 改为 g++-4.8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import os
file_dir = os.path.dirname(os.path.abspath(__file__))
fluid.load_op_library(os.path.join(file_dir, 'correlation_lib.so'))
from paddle.fluid.layer_helper import LayerHelper
def correlation(input1,
input2,
pad_size,
kernel_size,
max_displacement,
stride1,
stride2,
corr_type_multiply=1):
helper = LayerHelper("correlation", **locals())
output = helper.create_variable_for_type_inference(dtype=input1.dtype)
helper.append_op(type="correlation",
inputs={
"Input1": input1,
"Input2": input2
},
attrs={
"pad_size": pad_size,
"kernel_size": kernel_size,
"max_displacement": max_displacement,
"stride1": stride1,
"stride2": stride2,
"corr_type_multiply": corr_type_multiply
},
outputs={"Output": output})
return output
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
inline std::vector<int64_t> CorrelationOutputSize(int batch, int input_height, int input_width, int stride1, int stride2, int kernel_size, int pad_size, int max_displacement) {
std::vector<int64_t> output_shape({batch});
int kernel_radius = (kernel_size - 1) / 2;
int border_radius = kernel_radius + max_displacement;
int padded_input_height = input_height + 2 * pad_size;
int padded_input_width = input_width + 2 * pad_size;
int output_channel = ((max_displacement/stride2) * 2 + 1) * ((max_displacement/stride2) * 2 + 1);
output_shape.push_back(output_channel);
int output_height = std::ceil(static_cast<float>(padded_input_height - 2 * border_radius) / static_cast<float>(stride1));
int output_width = std::ceil(static_cast<float>(padded_input_width - 2 * border_radius) / static_cast<float>(stride1));
output_shape.push_back(output_height);
output_shape.push_back(output_width);
return output_shape;
}
class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override{
AddInput("Input1", "input1");
AddInput("Input2", "input2");
AddOutput("Output", "output");
AddAttr<int>("pad_size", "pad size for input1 and input2");
AddAttr<int>("kernel_size", "kernel size of input1 and input2");
AddAttr<int>("max_displacement", "max displacement of input1 and input2");
AddAttr<int>("stride1", "Input1 stride");
AddAttr<int>("stride2", "Input2 stride");
AddAttr<int>("corr_type_multiply", "correlation coefficient").SetDefault(1);
AddComment(R"DOC(Correlation of two feature map. Only support NCHW data format.)DOC");
}
};
class CorrelationOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(input1) cannot be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(input2) cannot be null");
int stride1 = ctx->Attrs().Get<int>("stride1");
int stride2 = ctx->Attrs().Get<int>("stride2");
int max_displacement = ctx->Attrs().Get<int>("max_displacement");
int pad_size = ctx->Attrs().Get<int>("pad_size");
int kernel_size = ctx->Attrs().Get<int>("kernel_size");
auto in_dims = ctx->GetInputDim("Input1");
auto in2_dims = ctx->GetInputDim("Input2");
PADDLE_ENFORCE_EQ(in_dims.size() == 4, true, "input1 must be 4-dims");
PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true, "input2 must be 4-dims");
std::vector<int64_t> output_shape = CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1, stride2, kernel_size, pad_size, max_displacement);
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override{
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
PADDLE_ENFORCE_EQ(input_data_type, ctx.Input<Tensor>("Input2")->type(), "Input1 and Input2 shoule have same type");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
template <typename T>
class CorrelationOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("correlation_grad");
op->SetInput("Input1", this->Input("Input1"));
op->SetInput("Input2", this->Input("Input2"));
op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
op->SetAttrMap(this->Attrs());
}
};
class CorrelationOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
PADDLE_ENFORCE_EQ(ctx->HasInput("Input1"), true, "Input(Input1) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("Input2"), true, "Input(Input2) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Output")), true, "Input(Output@GRAD) should not be null");
auto in1_dims = ctx->GetInputDim("Input1");
auto in2_dims = ctx->GetInputDim("Input2");
ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
ctx->SetOutputDim(framework::GradVarName("Input2"), in1_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override{
const auto* var = ctx.InputVar(framework::GradVarName("Output"));
if (var == nullptr) {
PADDLE_THROW("cannot find Output@GRAD");
}
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
ops::CorrelationOpGradMaker<paddle::framework::OpDesc>,
ops::CorrelationOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#define THREADS_PER_BLOCK 32
#define FULL_MASK 0xffffffff
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down_sync(FULL_MASK, val, offset);
}
return val;
}
template <typename T>
__forceinline__ __device__ T blockReduceSum(T val) {
static __shared__ T shared[32];
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val);
if (lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid == 0)
val = warpReduceSum(val);
return val;
}
template <typename T>
__global__ void set_zero(T *x, int num) {
for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x)
x[i] = static_cast<T>(0);
}
template <typename T>
__global__ void channel_first(const T *input, T *rinput, const int channel, const int height, const int width, const int pad_size) {
int n = blockIdx.x;
int h = blockIdx.y;
int w = blockIdx.z;
int ch_off = threadIdx.x;
T value;
int dimchw = channel * height * width;
int dimhw = height * width;
int p_dimw = (width + 2 * pad_size);
int p_dimh = (height + 2 * pad_size);
int p_dimchw = channel * p_dimw * p_dimh;
int p_dimcw = channel * p_dimw;
for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) {
value = input[n * dimchw + c * dimhw + h * width + w];
rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel + c] = value;
}
}
template <typename T>
__global__ void correlation_forward(T *output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int input_channel, const int input_height, const int input_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) {
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int n = blockIdx.x;
int h1 = blockIdx.y * stride1 + max_displacement;
int w1 = blockIdx.z * stride1 + max_displacement;
int c = threadIdx.x;
int p_dimchw = p_input_height * p_input_width * input_channel;
int p_dimcw = p_input_width * input_channel;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int nelems = kernel_size * kernel_size * p_dimc;
for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
for(int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
int w2 = w1 + ti * stride2;
int h2 = h1 + tj * stride2;
T acc0 = 0;
for(int j = -kernel_rad; j <= kernel_rad; ++j) {
for(int i = -kernel_rad; i <= kernel_rad; ++i) {
for(int ch = c; ch < p_dimc; ch += blockDim.x) {
int index1 = n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch;
int index2 = n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch;
acc0 += static_cast<T>(rinput1[index1] * rinput2[index2]);
}
}
}
if (blockDim.x == warpSize) {
__syncwarp();
acc0 = warpReduceSum(acc0);
} else {
__syncthreads();
acc0 = blockReduceSum(acc0);
}
if (threadIdx.x == 0) {
int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad);
const int t_index = n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z;
output[t_index] = static_cast<T>(acc0 / nelems);
}
}
}
}
//class CorrelationKernel<platform::CUDADeviceContext, T>
template <typename T>
class CorrelationKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must be CUDAPlace");
auto *input1 = ctx.Input<Tensor>("Input1");
auto *input2 = ctx.Input<Tensor>("Input2");
int pad_size = ctx.Attr<int>("pad_size");
int kernel_size = ctx.Attr<int>("kernel_size");
int stride1 = ctx.Attr<int>("stride1");
int stride2 = ctx.Attr<int>("stride2");
int max_displacement = ctx.Attr<int>("max_displacement");
int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
auto *output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// base on input1, NCHW
auto in_dims = input1->dims();
int N = in_dims[0];
int C = in_dims[1];
int H = in_dims[2];
int W = in_dims[3];
int padded_input_height = H + 2 * pad_size;
int padded_input_width = W + 2 * pad_size;
Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput1.mutable_data<T>(ctx.GetPlace());
Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput2.mutable_data<T>(ctx.GetPlace());
set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data<T>(), rinput1.numel());
set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data<T>(), rinput2.numel());
set_zero<<<(output->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(output->data<T>(), output->numel());
auto out_dims = output->dims();
int OC = out_dims[1];
int OH = out_dims[2];
int OW = out_dims[3];
dim3 blocks_grid(N, H, W);
dim3 threads_block(THREADS_PER_BLOCK);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(N, OH, OW);
correlation_forward<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(output->data<T>(), OC, OH, OW, rinput1.data<T>(),
C, H, W, rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
}
};
template <typename T>
__global__ void correlation_backward_input1(int item, T *grad_input1, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput2, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2) {
int n = item;
int h = blockIdx.x * stride1 + pad_size;
int w = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int xmin = (w - kernel_rad - max_displacement) / stride1;
int ymin = (h - kernel_rad - max_displacement) / stride1;
int xmax = (w + kernel_rad - max_displacement) / stride1;
int ymax = (h + kernel_rad - max_displacement) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
return;
}
if (xmin > xmax || ymin > ymax) {
return;
}
xmin = max(0, xmin);
xmax = min(output_width - 1, xmax);
ymin = max(0, ymin);
ymax = min(output_height - 1, ymax);
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int p_dimchw = input_channel * p_input_height * p_input_width;
int p_dimcw = input_channel * p_input_width;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int o_dimchw = input_channel * input_height * input_width;
int o_dimhw = input_height * input_width;
int o_dimw = input_width;
int nelems = kernel_size * kernel_size * input_channel;
__shared__ T prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c;
T val2 = rinput2[index2];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
prod_sum[tch_off] += grad_output[t_index] * val2;
}
}
}
__syncthreads();
if (tch_off == 0) {
T reduce_sum = 0;
for (int index = 0; index < THREADS_PER_BLOCK; index++) {
reduce_sum += prod_sum[index];
}
const int index1 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
grad_input1[index1] = static_cast<T>(reduce_sum / nelems);
}
}
template <typename T>
__global__ void correlation_backward_input2(int item, T *grad_input2, const int input_channel, const int input_height, const int input_width, const T *grad_output, const int output_channel, const int output_height, const int output_width, const T *rinput1, const int pad_size, const int kernel_size, const int max_displacement, const int stride1, const int stride2){
int n = item;
int h = blockIdx.x * stride1 + pad_size;
int w = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int p_dimchw = input_channel * p_input_height * p_input_width;
int p_dimcw = input_channel * p_input_width;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int o_dimchw = input_channel * input_height * input_width;
int o_dimhw = input_height * input_width;
int o_dimw = input_width;
int nelems = kernel_size * kernel_size * input_channel;
__shared__ T prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
int xmin = (w - kernel_rad - max_displacement - i2) / stride1;
int ymin = (h - kernel_rad - max_displacement - j2) / stride1;
int xmax = (w + kernel_rad - max_displacement - i2) / stride1;
int ymax = (h + kernel_rad - max_displacement - j2) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
continue;
}
if (xmin > xmax || ymin > ymax) {
continue;
}
xmin = max(0, xmin);
xmax = min(output_width - 1, xmax);
ymin = max(0, ymin);
ymax = min(output_height - 1, ymax);
int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c;
T val1 = rinput1[index1];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
prod_sum[tch_off] += grad_output[t_index] * val1;
}
}
}
__syncthreads();
if (tch_off == 0) {
T reduce_sum = 0;
for (int index = 0; index < THREADS_PER_BLOCK; index++) {
reduce_sum += prod_sum[index];
}
const int index2 = n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
grad_input2[index2] = static_cast<T>(reduce_sum / nelems);
}
}
template <typename T>
class CorrelationGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, "It must use CUDAPlace.");
const auto *input1 = ctx.Input<Tensor>("Input1");
const auto *input2 = ctx.Input<Tensor>("Input2");
const auto *grad_output = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int pad_size = ctx.Attr<int>("pad_size");
const int kernel_size = ctx.Attr<int>("kernel_size");
const int stride1 = ctx.Attr<int>("stride1");
const int stride2 = ctx.Attr<int>("stride2");
const int max_displacement = ctx.Attr<int>("max_displacement");
const int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
auto *grad_input1 = ctx.Output<Tensor>(framework::GradVarName("Input1"));
grad_input1->mutable_data<T>(ctx.GetPlace());
auto *grad_input2 = ctx.Output<Tensor>(framework::GradVarName("Input2"));
grad_input2->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto in_dims = input1->dims();
int N = in_dims[0];
int C = in_dims[1];
int H = in_dims[2];
int W = in_dims[3];
int padded_input_height = H + 2 * pad_size;
int padded_input_width = W + 2 * pad_size;
Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput1.mutable_data<T>(ctx.GetPlace());
Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput2.mutable_data<T>(ctx.GetPlace());
set_zero<<<(rinput1.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput1.data<T>(), rinput1.numel());
set_zero<<<(rinput2.numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(rinput2.data<T>(), rinput2.numel());
set_zero<<<(grad_input1->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input1->data<T>(), grad_input1->numel());
set_zero<<<(grad_input2->numel() + 512 - 1)/512, 512, 0, dev_ctx.stream()>>>(grad_input2->data<T>(), grad_input2->numel());
auto grad_out_dims = grad_output->dims();
int GOC = grad_out_dims[1];
int GOH = grad_out_dims[2];
int GOW = grad_out_dims[3];
dim3 blocks_grid(N, H, W);
dim3 threads_block(THREADS_PER_BLOCK);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(H, W, C);
for (int n = 0; n < N; n++) {
correlation_backward_input1<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH, GOW, rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
}
for (int n = 0; n < N; n++) {
correlation_backward_input2<T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH, GOW, rinput1.data<T>(), pad_size, kernel_size, max_displacement, stride1, stride2);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
correlation, ops::CorrelationKernel<float>,
ops::CorrelationKernel<double>);
REGISTER_OP_CUDA_KERNEL(
correlation_grad, ops::CorrelationGradKernel<float>,
ops::CorrelationGradKernel<double>);
# source /ssd1/vis/liufanglong/.bashrc
#export PATH=/home/work/cuda-9.0/bin:$PATH
#export PATH=/home/work/cuda-9.0/bin:$PATH
#export LD_LIBRARY_PATH="/home/work/cuda-9.0/lib64:$LD_LIBRARY_PATH"
#export LD_LIBRARY_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/lib64:$LD_LIBRARY_PATH
#export CPLUS_INCLUDE_PATH=/home/vis/chao/local/cudnn_v7.6/cuda/include:/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/include:$CPLUS_INCLUDE_PATH
#export LD_LIBRARY_PATH=/ssd1/vis/liufanglong/local/fluid_1.1.0_for_slurm/nccl_2.3.5/lib:$LD_LIBRARY_PATH
include_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_include())' )
lib_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_lib())' )
echo $include_dir
echo $lib_dir
OPS='correlation_op'
for op in ${OPS}
do
nvcc ${op}.cu -c -o ${op}.cu.o -ccbin cc -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO -DPADDLE_WITH_MKLDNN -Xcompiler -fPIC -std=c++11 -Xcompiler -fPIC -w --expt-relaxed-constexpr -O0 -g -DNVCC \
-I ${include_dir}/third_party/ \
-I ${include_dir}
done
# g++-4.8 correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
# g++ ${OPS}.cu.o ${OPS}.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
g++ correlation_op.cu.o correlation_op.cc -o correlation_lib.so -DPADDLE_WITH_MKLDNN -shared -fPIC -std=c++11 -O0 -g \
-I ${include_dir}/third_party/ \
-I ${include_dir} \
-L ${lib_dir} \
-L /usr/local/cuda/lib64/ -lpaddle_framework -lcudart
# rm *.cu.o
import unittest
from correlation import correlation
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
def corr(x_1,
x_2,
pad_size=4,
kernel_size=1,
max_displacement=4,
stride1=1,
stride2=1,
corr_multiply=1):
K = kernel_size
# rinput1 = np.pad(x_1, tuple([pad_size for _ in range(4)]), mode='constant').transpose(1, 2).transpose(2, 3)
# rinput2 = np.pad(x_2, tuple([pad_size for _ in range(4)]), mode='constant').transpose(1, 2).transpose(2, 3)
rinput1 = np.pad(x_1, ((0, 0), (0, 0), (pad_size, pad_size),
(pad_size, pad_size)),
mode='constant')
rinput2 = np.pad(x_2, ((0, 0), (0, 0), (pad_size, pad_size),
(pad_size, pad_size)),
mode='constant')
rinput1 = np.transpose(rinput1, (0, 2, 3, 1))
rinput2 = np.transpose(rinput2, (0, 2, 3, 1))
B = int(rinput1.shape[0])
H = int(x_1.shape[2])
W = int(x_2.shape[3])
d = max_displacement
D = 2 * d + 1
output = np.zeros((B, D * D, H, W), dtype=np.float32)
for b in range(B):
for i in range(H):
for j in range(W):
for k in range(-d, d + 1):
for l in range(-d, d + 1):
x1_index = i + pad_size
y1_index = j + pad_size
x2_index = x1_index + k
y2_index = y1_index + l
output[b, l + d + D * (k + d), i,
j] = np.mean(rinput1[b, x1_index:x1_index + K,
y1_index:y1_index + K] *
rinput2[b, x2_index:x2_index + K,
y2_index:y2_index + K])
return output
class TestCorrelationOp(unittest.TestCase):
def test_check_output(self):
#x_shape = (1, 196, 3, 3)
np.random.seed(13)
np.set_printoptions(threshold=np.inf)
x_shape = (2, 10, 3, 3)
x_type = 'float32'
x1 = fluid.layers.data(name='x1',
shape=x_shape,
dtype=x_type,
append_batch_size=False)
x2 = fluid.layers.data(name='x2',
shape=x_shape,
dtype=x_type,
append_batch_size=False)
x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
out_np = corr(x1_np,
x2_np,
pad_size=4,
kernel_size=1,
max_displacement=4,
stride1=1,
stride2=1)
out = correlation(x1,
x2,
pad_size=4,
kernel_size=1,
max_displacement=4,
stride1=1,
stride2=1)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
res = exe.run(feed={'x1': x1_np, 'x2': x2_np}, fetch_list=[out.name])
self.assertTrue(np.allclose(res[0], out_np))
class Net(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(Net, self).__init__(name_scope)
def forward(self, x1, x2):
y = correlation(x1,
x2,
pad_size=4,
kernel_size=1,
max_displacement=4,
stride1=1,
stride2=1)
return y
class TestCorrelationOpDyGraph(unittest.TestCase):
def test_check_output(self):
np.random.seed(13)
np.set_printoptions(threshold=np.inf)
x_shape = (2, 10, 3, 3)
x_type = 'float32'
place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
out_np = corr(x1_np,
x2_np,
pad_size=4,
kernel_size=1,
max_displacement=4,
stride1=1,
stride2=1)
x1 = to_variable(x1_np)
x2 = to_variable(x2_np)
corr_pd = Net('corr_pd')
y = corr_pd(x1, x2)
out = y.numpy()
self.assertTrue(np.allclose(out, out_np))
if __name__ == '__main__':
unittest.main()
此差异已折叠。
import paddle.fluid as fluid
from paddle.fluid.dygraph import Conv2D
__all__ = ['MultipleBasicBlock', 'MultipleBasicBlock_4']
def conv3x3(in_planes, out_planes, dilation=1, stride=1, param_attr=None):
return Conv2D(in_planes,
out_planes,
filter_size=3,
stride=stride,
padding=int(dilation * (3 - 1) / 2),
dilation=dilation,
bias_attr=False,
param_attr=param_attr)
class BasicBlock(fluid.dygraph.Layer):
expansion = 1
def __init__(self, inplanes, planes, dilation=1, stride=1, downsample=None):
super(BasicBlock, self).__init__()
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=1.0, seed=0))
self.conv1 = conv3x3(inplanes, planes, dilation, stride, param_attr)
self.conv2 = conv3x3(planes, planes, param_attr=param_attr)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
# out = self.bn1(out)
out = fluid.layers.relu(out)
out = self.conv2(out)
# out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = fluid.layers.relu(out)
return out
class MultipleBasicBlock(fluid.dygraph.Layer):
def __init__(self,
input_feature,
block,
num_blocks,
intermediate_feature=64,
dense=True):
super(MultipleBasicBlock, self).__init__()
self.dense = dense
self.num_block = num_blocks
self.intermediate_feature = intermediate_feature
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=1.0, seed=0))
self.block1 = Conv2D(input_feature,
intermediate_feature,
filter_size=7,
stride=1,
padding=3,
bias_attr=True,
param_attr=param_attr)
dim = intermediate_feature
self.block2 = block(dim, dim, dilation=1) if num_blocks >= 2 else None
self.block3 = block(dim, dim, dilation=1) if num_blocks >= 3 else None
self.block4 = block(dim, dim, dilation=1) if num_blocks >= 4 else None
self.block5 = Conv2D(dim, 3, 3, 1, 1)
def forward(self, x):
x = fluid.layers.relu(self.block1(x))
x = self.block2(x) if self.num_block >= 2 else x
x = self.block3(x) if self.num_block >= 3 else x
x = self.block4(x) if self.num_block >= 4 else x
x = self.block5(x)
return x
def MultipleBasicBlock_4(input_feature, intermediate_feature=64):
model = MultipleBasicBlock(input_feature, BasicBlock, 4,
intermediate_feature)
return model
cd pwcnet/correlation_op
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`python -c 'import paddle; print(paddle.sysconfig.get_lib())'`
export PYTHONPATH=$PYTHONPATH:`pwd`
cd ../../
VID_PATH=/paddle/work/github/DAIN/data/CBA.mp4
OUT_PATH=output
MODEL_PATH=DAIN_paddle_weight
#CUDA_VISIBLE_DEVICES=1 python demo.py \
# --time_step 0.125 \
# --video_path=$VID_PATH \
# --output_path=$OUT_PATH \
# --saved_model=$MODEL_PATH
CUDA_VISIBLE_DEVICES=2 python predict.py \
--time_step 0.125 \
--video_path=$VID_PATH \
--output_path=$OUT_PATH \
--saved_model=$MODEL_PATH
import os, sys
import glob
import shutil
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def dump_frames_ffmpeg(vid_path, outpath, r=None, ss=None, t=None):
ffmpeg = ['ffmpeg ', ' -loglevel ', ' error ']
vid_name = vid_path.split('/')[-1].split('.')[0]
out_full_path = os.path.join(outpath, vid_name)
if not os.path.exists(out_full_path):
os.makedirs(out_full_path)
# video file name
outformat = out_full_path + '/%08d.png'
if ss is not None and t is not None and r is not None:
cmd = ffmpeg + [
' -ss ',
ss,
' -t ',
t,
' -i ',
vid_path,
' -r ',
r,
# ' -f ', ' image2 ',
# ' -s ', ' 960*540 ',
' -qscale:v ',
' 0.1 ',
' -start_number ',
' 0 ',
# ' -qmax ', ' 1 ',
outformat
]
else:
cmd = ffmpeg + [' -i ', vid_path, ' -start_number ', ' 0 ', outformat]
cmd = ''.join(cmd)
print(cmd)
if os.system(cmd) == 0:
print('Video: {} done'.format(vid_name))
else:
print('Video: {} error'.format(vid_name))
print('')
sys.stdout.flush()
return out_full_path
def frames_to_video_ffmpeg(framepath, videopath, r):
ffmpeg = ['ffmpeg ', ' -loglevel ', ' error ']
cmd = ffmpeg + [
' -r ', r, ' -f ', ' image2 ', ' -i ', framepath, ' -vcodec ',
' libx264 ', ' -pix_fmt ', ' yuv420p ', ' -crf ', ' 16 ', videopath
]
cmd = ''.join(cmd)
print(cmd)
if os.system(cmd) == 0:
print('Video: {} done'.format(videopath))
else:
print('Video: {} error'.format(videopath))
print('')
sys.stdout.flush()
def combine_frames(input, interpolated, combined, num_frames):
frames1 = sorted(glob.glob(os.path.join(input, '*.png')))
frames2 = sorted(glob.glob(os.path.join(interpolated, '*.png')))
num1 = len(frames1)
num2 = len(frames2)
# assert (num1 - 1) * num_frames == num2
for i in range(num1):
src = frames1[i]
imgname = int(src.split('/')[-1].split('.')[-2])
assert i == imgname
dst = os.path.join(combined, '{:08d}.png'.format(i * (num_frames + 1)))
shutil.copy2(src, dst)
if i < num1 - 1:
for k in range(num_frames):
src = frames2[i * num_frames + k]
dst = os.path.join(
combined, '{:08d}.png'.format(i * (num_frames + 1) + k + 1))
shutil.copy2(src, dst)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册