提交 a7ee7f0f 编写于 作者: L Liangliang He

Merge branch 'conv-pad' into 'master'

Bug Fix

See merge request !257
...@@ -135,6 +135,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> { ...@@ -135,6 +135,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_; T relux_max_limit_;
T prelu_alpha_; T prelu_alpha_;
cl::Kernel kernel_; cl::Kernel kernel_;
std::string tuning_key_prefix_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -228,11 +228,12 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -228,11 +228,12 @@ struct Conv2dFunctor : Conv2dFunctorBase {
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, strides_, input->shape().data(), filter->shape().data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings_.data()); padding_type_, output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
output->Resize(output_shape); output->Resize(output_shape);
...@@ -260,13 +261,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -260,13 +261,13 @@ struct Conv2dFunctor : Conv2dFunctorBase {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
int padded_height = input_height + paddings_[0]; int padded_height = input_height + paddings[0];
int padded_width = input_width + paddings_[1]; int padded_width = input_width + paddings[1];
Tensor padded_input; Tensor padded_input;
// Keep this alive during kernel execution // Keep this alive during kernel execution
if (paddings_[0] > 0 || paddings_[1] > 0) { if (paddings[0] > 0 || paddings[1] > 0) {
ConstructNHWCInputWithPadding(input, paddings_.data(), &padded_input); ConstructNHWCInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input; input = &padded_input;
} }
......
...@@ -294,11 +294,12 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -294,11 +294,12 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
fake_filter_shape[3] = 1; fake_filter_shape[3] = 1;
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations_, strides_, input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings_.data()); padding_type_, output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
auto input_shape = fake_filter_shape; auto input_shape = fake_filter_shape;
output->Resize(output_shape); output->Resize(output_shape);
...@@ -329,10 +330,10 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -329,10 +330,10 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int paddings_top = paddings_[0] / 2; int paddings_top = paddings[0] / 2;
int paddings_bottom = paddings_[0] - paddings_top; int paddings_bottom = paddings[0] - paddings_top;
int paddings_left = paddings_[1] / 2; int paddings_left = paddings[1] / 2;
int paddings_right = paddings_[1] - paddings_left; int paddings_right = paddings[1] - paddings_left;
int padded_h_start = 0 - paddings_top; int padded_h_start = 0 - paddings_top;
int padded_w_start = 0 - paddings_left; int padded_w_start = 0 - paddings_left;
......
...@@ -22,7 +22,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -22,7 +22,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const index_t channels = input->dim(3); const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
std::string tuning_key_prefix;
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -35,23 +34,23 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -35,23 +34,23 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
switch (activation_) { switch (activation_) {
case RELU: case RELU:
tuning_key_prefix = "relu_opencl_kernel_"; tuning_key_prefix_ = "relu_opencl_kernel_";
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: case RELUX:
tuning_key_prefix = "relux_opencl_kernel_"; tuning_key_prefix_ = "relux_opencl_kernel_";
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU: case PRELU:
tuning_key_prefix = "prelu_opencl_kernel_"; tuning_key_prefix_ = "prelu_opencl_kernel_";
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH: case TANH:
tuning_key_prefix = "tanh_opencl_kernel_"; tuning_key_prefix_ = "tanh_opencl_kernel_";
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: case SIGMOID:
tuning_key_prefix = "sigmoid_opencl_kernel_"; tuning_key_prefix_ = "sigmoid_opencl_kernel_";
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
default: default:
...@@ -60,12 +59,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -60,12 +59,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_ = kernel_ =
runtime->BuildKernel("activation", kernel_name, built_options); runtime->BuildKernel("activation", kernel_name, built_options);
int idx = 0; int idx = 0;
kernel_.setArg( kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_)); kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(prelu_alpha_)); kernel_.setArg(idx++, static_cast<float>(prelu_alpha_));
kernel_.setArg(idx++, kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
*(static_cast<cl::Image2D *>(output->buffer())));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -73,7 +70,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -73,7 +70,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix, output->dim(0), output->dim(1), Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
} }
......
...@@ -68,7 +68,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -68,7 +68,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 || if (!input->is_image() || strides_[0] != strides_[1] ||
((kernel_h == 1 || kernel_h == 3) && strides_[0] > 2) ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
...@@ -80,11 +81,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -80,11 +81,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, strides_, input->shape().data(), filter->shape().data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings_.data()); padding_type_, output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
...@@ -95,11 +97,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -95,11 +97,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
selector[kernel_h - 1] != nullptr && selector[kernel_h - 1] != nullptr &&
0 < strides_[0] && strides_[0] < 3 ) { 0 < strides_[0] && strides_[0] < 3 ) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, activation_, conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
relux_max_limit_, prelu_alpha_, DataTypeToEnum<T>::value, relux_max_limit_, prelu_alpha_, DataTypeToEnum<T>::value,
output, future); output, future);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, prelu_alpha_, activation_, relux_max_limit_, prelu_alpha_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} }
......
...@@ -153,18 +153,19 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -153,18 +153,19 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
fake_filter_shape[3] = 1; fake_filter_shape[3] = 1;
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations_, strides_, input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings_.data()); padding_type_, output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, prelu_alpha_, activation_, relux_max_limit_, prelu_alpha_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} }
......
...@@ -23,12 +23,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -23,12 +23,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
input->dim(3), input->dim(3) input->dim(3), input->dim(3)
}; };
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), input->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_type_, dilations_, strides_, this->padding_type_,
output_shape.data(), paddings_.data()); output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
...@@ -66,8 +67,8 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -66,8 +67,8 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(out_height)); kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, paddings_[0] / 2); kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings_[1] / 2); kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides_[0]); kernel_.setArg(idx++, strides_[0]);
kernel_.setArg(idx++, kernels_[0]); kernel_.setArg(idx++, kernels_[0]);
kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer()))); kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
......
...@@ -17,11 +17,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -17,11 +17,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(), input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
strides_.data(), padding_type_, output_shape.data(), paddings_.data()); strides_.data(), padding_type_, output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h = (output_shape[1] + 1) / 2;
...@@ -52,8 +53,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -52,8 +53,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings_[0] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings_[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
} }
const uint32_t gws[2] = {static_cast<uint32_t>(out_width), const uint32_t gws[2] = {static_cast<uint32_t>(out_width),
......
...@@ -64,13 +64,13 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -64,13 +64,13 @@ struct PoolingFunctor : PoolingFunctorBase {
input_tensor->dim(3), input_tensor->dim(3) input_tensor->dim(3), input_tensor->dim(3)
}; };
if (paddings_.empty()) { std::vector<int> paddings(2);
paddings_.resize(2);
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_type_, dilations_, strides_, this->padding_type_,
output_shape.data(), paddings_.data()); output_shape.data(), paddings.data());
if (!paddings_.empty()) {
paddings = paddings_;
} }
output_tensor->Resize(output_shape); output_tensor->Resize(output_shape);
...@@ -99,8 +99,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -99,8 +99,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings_[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings_[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(4) #pragma omp parallel for collapse(4)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册