提交 48e6ee96 编写于 作者: L liutuo

add 2x2s2_deconv for neon and optimize deconv and dw_deconv ops' structure

上级 fc7f4967
......@@ -22,3 +22,9 @@ mace/examples/android/macelibrary/src/main/cpp/mace/
*swp
*~
.python-version
mace/examples/android/macelibrary/src/main/cpp/include/mace/public/
mace/examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/
mace/examples/android/macelibrary/src/main/jniLibs/arm64-v8a/
......@@ -21,6 +21,18 @@
namespace mace {
namespace ops {
void Deconv2dNeonK2x2S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Deconv2dNeonK2x2S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Deconv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t *in_shape,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/ops/arm/deconv_2d_neon.h"
namespace mace {
namespace ops {
void Deconv2dNeonK2x2S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
float *out_base0 = output + (b * outch + oc) * out_img_size;
float *out_base1 = out_base0 + out_img_size;
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base0 = filter + (oc * inch + ic) * 4;
const float *kernel_base1 = kernel_base0 + inch * 4;
const float *in = input_base;
// output channel 0
const float *k0 = kernel_base0;
// output channel 1
const float *k1 = kernel_base1;
#if defined(MACE_ENABLE_NEON)
// load filter
float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base0 = out_base0 + i * outw;
float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw;
float *out_row_base1 = out_base1 + i * outw;
float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13;
out00 = vld1q_f32(out_row0_0);
out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
vst1q_f32(out_row0_0, out00);
out01 = vld1q_f32(out_row0_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
vst1q_f32(out_row0_0 + 1, out01);
out02 = vld1q_f32(out_row0_1);
out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
vst1q_f32(out_row0_1, out02);
out03 = vld1q_f32(out_row0_1 + 1);
out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
vst1q_f32(out_row0_1 + 1, out03);
out10 = vld1q_f32(out_row1_0);
out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
vst1q_f32(out_row1_0, out10);
out11 = vld1q_f32(out_row1_0 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
vst1q_f32(out_row1_0 + 1, out11);
out12 = vld1q_f32(out_row1_1);
out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
vst1q_f32(out_row1_1, out12);
out13 = vld1q_f32(out_row1_1 + 1);
out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
vst1q_f32(out_row1_1 + 1, out13);
in += 4;
out_row0_0 += 4;
out_row0_1 += 4;
out_row1_0 += 4;
out_row1_1 += 4;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k];
out_row0_1[k] += val * k0[k + 2];
out_row1_0[k] += val * k1[k];
out_row1_1[k] += val * k1[k + 2];
}
in++;
out_row0_0++;
out_row0_1++;
out_row1_0++;
out_row1_1++;
}
}
}
} else {
float *out_base0 = output + (b * outch + oc) * outh * outw;
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base0 = filter + (oc * inch + ic) * 4;
const float *in = input_base;
const float *k0 = kernel_base0;
#if defined(MACE_ENABLE_NEON)
// load filter
float32x4_t k0_vec = vld1q_f32(k0);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base0 = out_base0 + i * outw;
float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03;
out00 = vld1q_f32(out_row0_0);
out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
vst1q_f32(out_row0_0, out00);
out01 = vld1q_f32(out_row0_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
vst1q_f32(out_row0_0 + 1, out01);
out02 = vld1q_f32(out_row0_1);
out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
vst1q_f32(out_row0_1, out02);
out03 = vld1q_f32(out_row0_1 + 1);
out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
vst1q_f32(out_row0_1 + 1, out03);
in += 4;
out_row0_0 += 4;
out_row0_1 += 4;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k];
out_row0_1[k] += val * k0[k + 2];
}
in++;
out_row0_0++;
out_row0_1++;
}
}
}
}
}
}
}
void Deconv2dNeonK2x2S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base = filter + (oc * inch + ic) * 4;
const float *in = input_base;
const float *k0 = kernel_base;
#if defined(MACE_ENABLE_NEON)
float32x4_t k0_vec = vld1q_f32(k0);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base = out_base + i * 2 * outw;
float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
// out row 0
float32x4x2_t out00 = vld2q_f32(out_row_0);
out00.val[0] =
neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
out00.val[1] =
neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
vst2q_f32(out_row_0, out00);
// out row 1
float32x4x2_t out10 = vld2q_f32(out_row_1);
out10.val[0] =
neon_vfma_lane_2(out10.val[0], in_vec, k0_vec);
out10.val[1] =
neon_vfma_lane_3(out10.val[1], in_vec, k0_vec);
vst2q_f32(out_row_1, out10);
in += 4;
out_row_0 += 8;
out_row_1 += 8;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 2; ++k) {
out_row_0[k] += val * k0[k];
out_row_1[k] += val * k0[k + 2];
}
in++;
out_row_0 += 2;
out_row_1 += 2;
}
}
}
}
}
}
} // namespace ops
} // namespace mace
......@@ -25,11 +25,9 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/deconv_2d.h"
......@@ -38,134 +36,6 @@
namespace mace {
namespace ops {
class Deconv2dOpBase : public Operation {
public:
explicit Deconv2dOpBase(OpConstructContext *context)
: Operation(context),
strides_(Operation::GetRepeatedArgs<int>("strides")),
padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))),
paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
model_type_(static_cast<ops::FrameworkType>(
Operation::GetOptionalArg<int>("framework_type", 0))),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
static void CalcDeconvOutputSize(
const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const int *strides,
index_t *output_shape,
const int *padding_size,
int *input_padding,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
MACE_CHECK_NOTNULL(input_shape);
MACE_CHECK_NOTNULL(filter_shape);
MACE_CHECK_NOTNULL(strides);
const index_t output_channel = filter_shape[0];
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t kernel_h = filter_shape[2];
const index_t kernel_w = filter_shape[3];
input_padding[0] = static_cast<int>((kernel_h -1) * 2 - padding_size[0]);
input_padding[1] = static_cast<int>((kernel_w -1) * 2 - padding_size[1]);
input_padding[0] = std::max<int>(0, input_padding[0]);
input_padding[1] = std::max<int>(0, input_padding[1]);
index_t out_height =
(in_height - 1) * strides[0] + kernel_h - padding_size[0];
index_t out_width =
(in_width - 1) * strides[1] + kernel_w - padding_size[1];
output_shape[0] = input_shape[0];
if (isNCHW) {
output_shape[1] = output_channel;
output_shape[2] = out_height;
output_shape[3] = out_width;
} else {
output_shape[1] = out_height;
output_shape[2] = out_width;
output_shape[3] = output_channel;
}
}
static void CalcDeconvPaddingAndInputSize(
const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const int *strides,
Padding padding,
const index_t *output_shape,
int *padding_size,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
MACE_CHECK_NOTNULL(input_shape);
MACE_CHECK_NOTNULL(filter_shape);
MACE_CHECK_NOTNULL(strides);
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t out_height = isNCHW ? output_shape[2] : output_shape[1];
const index_t out_width = isNCHW ? output_shape[3] : output_shape[2];
const index_t extended_input_height = (in_height - 1) * strides[0] + 1;
const index_t extended_input_width = (in_width - 1) * strides[1] + 1;
const index_t filter_h = filter_shape[2];
const index_t filter_w = filter_shape[3];
index_t expected_input_height = 0, expected_input_width = 0;
switch (padding) {
case VALID:
expected_input_height =
(out_height - filter_h + strides[0]) / strides[0];
expected_input_width =
(out_width - filter_w + strides[1]) / strides[1];
break;
case SAME:
expected_input_height =
(out_height + strides[0] - 1) / strides[0];
expected_input_width =
(out_width + strides[1] - 1) / strides[1];
break;
default:
MACE_CHECK(false, "Unsupported padding type: ", padding);
}
MACE_CHECK(expected_input_height == in_height,
expected_input_height, "!=", in_height);
MACE_CHECK(expected_input_width == in_width,
expected_input_width, "!=", in_width);
const int p_h = static_cast<int>(out_height +
filter_h - 1 - extended_input_height);
const int p_w = static_cast<int>(out_width +
filter_w - 1 - extended_input_width);
padding_size[0] = std::max<int>(0, p_h);
padding_size[1] = std::max<int>(0, p_w);
}
protected:
std::vector<int> strides_; // [stride_h, stride_w]
const Padding padding_type_;
std::vector<int> paddings_;
const FrameworkType model_type_;
const ActivationType activation_;
const float relux_max_limit_;
};
template <DeviceType D, class T>
class Deconv2dOp;
......@@ -193,56 +63,65 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<int> paddings(2);
std::vector<int> out_paddings(2);
std::vector<index_t> output_shape(4);
std::vector<int> in_paddings(2, 0);
std::vector<int> out_paddings(2, 0);
std::vector<index_t> out_shape(4, 0);
std::vector<index_t> padded_out_shape(4, 0);
if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow
paddings = std::vector<int>(2, 0);
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
auto output_shape_data =
output_shape_tensor->data<int32_t>();
output_shape =
out_shape =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
const index_t t = output_shape[1];
output_shape[1] = output_shape[3];
output_shape[3] = output_shape[2];
output_shape[2] = t;
const index_t t = out_shape[1];
out_shape[1] = out_shape[3];
out_shape[3] = out_shape[2];
out_shape[2] = t;
CalcDeconvPaddingAndInputSize(
CalcDeconvShape_TF(
input->shape().data(),
filter->shape().data(),
strides_.data(), padding_type_,
output_shape.data(),
paddings.data(), true);
out_shape.data(),
strides_.data(),
1,
padding_type_,
in_paddings.data(),
out_paddings.data(),
padded_out_shape.data(),
true);
} else { // caffe
out_paddings = paddings_;
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
output_shape.data(),
out_paddings.data(),
paddings.data(),
true);
if (!paddings_.empty()) out_paddings = paddings_;
CalcDeconvShape_Caffe(
input->shape().data(),
filter->shape().data(),
strides_.data(),
out_paddings.data(),
1,
in_paddings.data(),
out_shape.data(),
padded_out_shape.data(),
true);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
const index_t *in_shape = input->shape().data();
MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ",
output_shape[1]);
MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
out_shape[1]);
MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
in_shape[1]);
MACE_CHECK(in_shape[0] == output_shape[0],
MACE_CHECK(in_shape[0] == out_shape[0],
"Input/Output batch size mismatch");
std::function<void(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
const index_t *output_shape,
float *output)> deconv_func;
Tensor::MappingGuard input_mapper(input);
......@@ -254,13 +133,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h;
const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w;
const index_t pad_h = (padded_out_h - output_shape[2]) / 2;
const index_t pad_w = (padded_out_w - output_shape[3]) / 2;
const index_t pad_h = out_paddings[0] / 2;
const index_t pad_w = out_paddings[1] / 2;
std::vector<index_t> padded_out_shape({output_shape[0], output_shape[1],
padded_out_h, padded_out_w});
index_t padded_out_size =
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
......@@ -274,6 +149,11 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
padded_out.Clear();
auto *padded_out_data = padded_out.mutable_data<float>();
bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 &&
strides_[0] == strides_[1] && strides_[0] == 1;
bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 &&
strides_[0] == strides_[1] && strides_[0] == 2;
bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
strides_[0] == strides_[1] && strides_[0] == 1;
bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
......@@ -284,73 +164,98 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
strides_[0] == strides_[1] && strides_[0] == 2;
if (use_neon_3x3_s1) {
if (use_neon_2x2_s1) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *in_shape,
const index_t *padded_out_shape,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK2x2S1(input,
filter,
input_shape,
padded_output_shape,
padded_output);
};
} else if (use_neon_2x2_s2) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK2x2S2(input,
filter,
input_shape,
padded_output_shape,
padded_output);
};
} else if (use_neon_3x3_s1) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK3x3S1(input,
filter,
in_shape,
padded_out_shape,
input_shape,
padded_output_shape,
padded_output);
};
} else if (use_neon_3x3_s2) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *in_shape,
const index_t *padded_out_shape,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK3x3S2(input,
filter,
in_shape,
padded_out_shape,
input_shape,
padded_output_shape,
padded_output);
};
} else if (use_neon_4x4_s1) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *in_shape,
const index_t *padded_out_shape,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK4x4S1(input,
filter,
in_shape,
padded_out_shape,
input_shape,
padded_output_shape,
padded_output);
};
} else if (use_neon_4x4_s2) {
deconv_func = [=](const float *input,
const float *filter,
const index_t *in_shape,
const index_t *padded_out_shape,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dNeonK4x4S2(input,
filter,
in_shape,
padded_out_shape,
input_shape,
padded_output_shape,
padded_output);
};
} else {
deconv_func = [=](const float *input,
const float *filter,
const index_t *in_shape,
const index_t *padded_out_shape,
const index_t *input_shape,
const index_t *padded_output_shape,
float *padded_output) {
Deconv2dGeneral(input,
filter,
kernel_h,
kernel_w,
strides_.data(),
in_shape,
padded_out_shape,
input_shape,
padded_output_shape,
padded_output);
};
}
bool no_pad =
padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
(padded_out_shape[2] == out_shape[2]) &&
(padded_out_shape[3] == out_shape[3]);
float *out_data = no_pad ? output_data : padded_out_data;
deconv_func(input_data,
......@@ -361,16 +266,16 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
if (!no_pad) {
CropPadOut<float>(out_data,
padded_out_shape.data(),
output_shape.data(),
out_shape.data(),
pad_h,
pad_w,
output_data);
}
if (bias_data != nullptr) {
const index_t batch = output_shape[0];
const index_t channels = output_shape[1];
const index_t img_size = output_shape[2] * output_shape[3];
const index_t batch = out_shape[0];
const index_t channels = out_shape[1];
const index_t img_size = out_shape[2] * out_shape[3];
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
......@@ -476,39 +381,46 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<int> paddings(2);
std::vector<int> out_paddings(2);
std::vector<index_t> output_shape(4);
std::vector<int> in_paddings(2, 0);
std::vector<index_t> out_shape(4, 0);
if (model_type_ == FrameworkType::TENSORFLOW) {
paddings = std::vector<int>(2, 0);
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
auto output_shape_data =
output_shape_tensor->data<int32_t>();
output_shape =
out_shape =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
CalcDeconvPaddingAndInputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
padding_type_,
output_shape.data(),
paddings.data());
CalcDeconvShape_TF(
input->shape().data(),
filter->shape().data(),
out_shape.data(),
strides_.data(),
1,
padding_type_,
in_paddings.data(),
nullptr,
nullptr);
} else {
out_paddings = paddings_;
paddings = std::vector<int>(2, 0);
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
output_shape.data(),
out_paddings.data(),
paddings.data());
std::vector<int> out_paddings(2, 0);
if (!paddings_.empty()) out_paddings = paddings_;
CalcDeconvShape_Caffe(
input->shape().data(),
filter->shape().data(),
strides_.data(),
out_paddings.data(),
1,
in_paddings.data(),
out_shape.data(),
nullptr);
}
return kernel_->Compute(context, input, filter, bias,
strides_.data(), paddings.data(), activation_,
relux_max_limit_, output_shape, output);
strides_.data(), in_paddings.data(), activation_,
relux_max_limit_, out_shape, output);
}
private:
......
......@@ -15,7 +15,14 @@
#ifndef MACE_OPS_DECONV_2D_H_
#define MACE_OPS_DECONV_2D_H_
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/operator.h"
#include "mace/core/types.h"
#include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -25,6 +32,167 @@ enum FrameworkType {
CAFFE = 1,
};
class Deconv2dOpBase : public Operation {
public:
explicit Deconv2dOpBase(OpConstructContext *context)
: Operation(context),
strides_(Operation::GetRepeatedArgs<int>("strides")),
padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))),
paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
group_(Operation::GetOptionalArg<int>("group", 1)),
model_type_(static_cast<ops::FrameworkType>(
Operation::GetOptionalArg<int>("framework_type", 0))),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(
Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
static void CalcDeconvShape_Caffe(
const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const int *strides,
const int *out_paddings,
const int group,
int *in_paddings,
index_t *out_shape,
index_t *padded_out_shape,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(out_paddings);
MACE_CHECK_NOTNULL(input_shape);
MACE_CHECK_NOTNULL(filter_shape);
MACE_CHECK_NOTNULL(strides);
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t output_channel = filter_shape[0] * group;
const index_t kernel_h = filter_shape[2];
const index_t kernel_w = filter_shape[3];
index_t padded_out_height =
(in_height - 1) * strides[0] + kernel_h;
index_t padded_out_width =
(in_width - 1) * strides[1] + kernel_w;
if (in_paddings != nullptr) {
in_paddings[0] = static_cast<int>((kernel_h - 1) * 2 - out_paddings[0]);
in_paddings[1] = static_cast<int>((kernel_w - 1) * 2 - out_paddings[1]);
in_paddings[0] = std::max<int>(0, in_paddings[0]);
in_paddings[1] = std::max<int>(0, in_paddings[1]);
}
if (padded_out_shape != nullptr) {
padded_out_shape[0] = input_shape[0];
padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
}
if (out_shape != nullptr) {
index_t out_height = padded_out_height - out_paddings[0];
index_t out_width = padded_out_width - out_paddings[1];
out_shape[0] = input_shape[0];
out_shape[1] = isNCHW ? output_channel : out_height;
out_shape[2] = isNCHW ? out_height : out_width;
out_shape[3] = isNCHW ? out_width : output_channel;
}
}
static void CalcDeconvShape_TF(
const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const index_t *output_shape,
const int *strides,
const int group,
Padding padding_type,
int *in_paddings,
int *out_paddings,
index_t *padded_out_shape,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(input_shape);
MACE_CHECK_NOTNULL(filter_shape);
MACE_CHECK_NOTNULL(strides);
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t out_height = isNCHW ? output_shape[2] : output_shape[1];
const index_t out_width = isNCHW ? output_shape[3] : output_shape[2];
const index_t extended_in_height = (in_height - 1) * strides[0] + 1;
const index_t extended_in_width = (in_width - 1) * strides[1] + 1;
const index_t kernel_h = filter_shape[2];
const index_t kernel_w = filter_shape[3];
index_t expected_input_height = 0, expected_input_width = 0;
switch (padding_type) {
case VALID:
expected_input_height =
(out_height - kernel_h + strides[0]) / strides[0];
expected_input_width =
(out_width - kernel_w + strides[1]) / strides[1];
break;
case SAME:
expected_input_height =
(out_height + strides[0] - 1) / strides[0];
expected_input_width =
(out_width + strides[1] - 1) / strides[1];
break;
default:
MACE_CHECK(false, "Unsupported padding type: ", padding_type);
}
MACE_CHECK(expected_input_height == in_height,
expected_input_height, "!=", in_height);
MACE_CHECK(expected_input_width == in_width,
expected_input_width, "!=", in_width);
const index_t padded_out_height =
(in_height - 1) * strides[0] + kernel_h;
const index_t padded_out_width =
(in_width - 1) * strides[1] + kernel_w;
if (in_paddings != nullptr) {
const int p_h =
static_cast<int>(out_height + kernel_h - 1 - extended_in_height);
const int p_w =
static_cast<int>(out_width + kernel_w - 1 - extended_in_width);
in_paddings[0] = std::max<int>(0, p_h);
in_paddings[1] = std::max<int>(0, p_w);
}
if (out_paddings != nullptr) {
const int o_p_h = static_cast<int>(padded_out_height - out_height);
const int o_p_w = static_cast<int>(padded_out_width - out_width);
out_paddings[0] = std::max<int>(0, o_p_h);
out_paddings[1] = std::max<int>(0, o_p_w);
}
if (padded_out_shape != nullptr) {
index_t output_channel = filter_shape[0] * group;
padded_out_shape[0] = output_shape[0];
padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
}
}
protected:
std::vector<int> strides_; // [stride_h, stride_w]
const Padding padding_type_;
std::vector<int> paddings_;
const int group_;
const FrameworkType model_type_;
const ActivationType activation_;
const float relux_max_limit_;
};
template <typename T>
void CropPadOut(const T *input,
const index_t *in_shape,
......
......@@ -370,9 +370,9 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) {
namespace {
template <DeviceType D, typename T>
void TestComplexDeconvNxNS12(const int batch,
const std::vector<int> &shape,
const int stride) {
void TestComplexDeconvNxN(const int batch,
const std::vector<int> &shape,
const int stride) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type, int padding) {
......@@ -415,8 +415,6 @@ void TestComplexDeconvNxNS12(const int batch,
output_shape.push_back(output_channels);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
} else {
// out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
// out_w = (width -1) * stride + 1 + padding - kernel_w + 1;
paddings.push_back(padding);
paddings.push_back(padding);
}
......@@ -497,38 +495,42 @@ void TestComplexDeconvNxNS12(const int batch,
1e-4);
};
for (int kernel_size : {3, 4, 5, 7}) {
func(kernel_size, kernel_size, stride, stride, VALID, -1);
func(kernel_size, kernel_size, stride, stride, SAME, -1);
func(kernel_size, kernel_size, stride, stride, VALID, 2);
func(kernel_size, kernel_size, stride, stride, VALID, 3);
for (int kernel_size : {2, 3, 4, 5, 7}) {
if (kernel_size >= stride) {
func(kernel_size, kernel_size, stride, stride, VALID, -1);
func(kernel_size, kernel_size, stride, stride, SAME, -1);
func(kernel_size, kernel_size, stride, stride, VALID, 1);
func(kernel_size, kernel_size, stride, stride, VALID, 2);
func(kernel_size, kernel_size, stride, stride, VALID, 3);
}
}
}
} // namespace
TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS12) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 2);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 1);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 2);
}
TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 3);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 4);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 3);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 4);
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 3);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 4);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 3);
TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 4);
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
TestComplexDeconvNxN<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
TestComplexDeconvNxN<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
}
} // namespace test
......
......@@ -24,11 +24,8 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/arm/depthwise_deconv2d_neon.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -38,90 +35,15 @@
namespace mace {
namespace ops {
class DepthwiseDeconv2dOpBase : public Operation {
public:
explicit DepthwiseDeconv2dOpBase(OpConstructContext *context)
: Operation(context),
strides_(Operation::GetRepeatedArgs<int>("strides")),
paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
group_(Operation::GetOptionalArg<int>("group", 1)),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
0.0f)) {}
static void CalcGroupDeconvOutputSize(
const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const int group,
const int *strides,
const int *paddings,
int *pre_paddings,
index_t *out_shape,
index_t *padded_out_shape,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(paddings);
MACE_CHECK_NOTNULL(input_shape);
MACE_CHECK_NOTNULL(filter_shape);
MACE_CHECK_NOTNULL(strides);
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t output_channel = filter_shape[0] * group;
const index_t kernel_h = filter_shape[2];
const index_t kernel_w = filter_shape[3];
index_t padded_out_height =
(in_height - 1) * strides[0] + kernel_h;
index_t padded_out_width =
(in_width - 1) * strides[1] + kernel_w;
if (pre_paddings != nullptr) {
pre_paddings[0] = static_cast<int>((kernel_h - 1) * 2 - paddings[0]);
pre_paddings[1] = static_cast<int>((kernel_w - 1) * 2 - paddings[1]);
pre_paddings[0] = std::max<int>(0, pre_paddings[0]);
pre_paddings[1] = std::max<int>(0, pre_paddings[1]);
}
if (padded_out_shape != nullptr) {
padded_out_shape[0] = input_shape[0];
padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
}
if (out_shape != nullptr) {
index_t out_height = padded_out_height - paddings[0];
index_t out_width = padded_out_width - paddings[1];
out_shape[0] = input_shape[0];
out_shape[1] = isNCHW ? output_channel : out_height;
out_shape[2] = isNCHW ? out_height : out_width;
out_shape[3] = isNCHW ? out_width : output_channel;
}
}
protected:
std::vector<int> strides_; // [stride_h, stride_w]
std::vector<int> paddings_;
const int group_;
const ActivationType activation_;
const float relux_max_limit_;
};
template <DeviceType D, class T>
class DepthwiseDeconv2dOp;
template<>
class DepthwiseDeconv2dOp<DeviceType::CPU, float>
: public DepthwiseDeconv2dOpBase {
: public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: DepthwiseDeconv2dOpBase(context) {}
: Deconv2dOpBase(context) {}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......@@ -138,15 +60,17 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
std::vector<index_t> padded_out_shape(4, 0);
if (!paddings_.empty()) out_paddings = paddings_;
CalcGroupDeconvOutputSize(input->shape().data(),
filter->shape().data(),
group_,
strides_.data(),
out_paddings.data(),
nullptr,
out_shape.data(),
padded_out_shape.data(),
true);
CalcDeconvShape_Caffe(
input->shape().data(),
filter->shape().data(),
strides_.data(),
out_paddings.data(),
group_,
nullptr,
out_shape.data(),
padded_out_shape.data(),
true);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
index_t kernel_h = filter->dim(2);
......@@ -480,10 +404,10 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public DepthwiseDeconv2dOpBase {
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: DepthwiseDeconv2dOpBase(context) {
: Deconv2dOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
} else {
......@@ -501,16 +425,18 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public DepthwiseDeconv2dOpBase {
MACE_CHECK_NOTNULL(output);
std::vector<int> in_paddings(2, 0);
std::vector<int> out_paddings(2, 0);
std::vector<index_t> out_shape(4, 0);
CalcGroupDeconvOutputSize(input->shape().data(),
filter->shape().data(),
group_,
strides_.data(),
paddings_.data(),
in_paddings.data(),
out_shape.data(),
nullptr);
if (!paddings_.empty()) out_paddings = paddings_;
CalcDeconvShape_Caffe(input->shape().data(),
filter->shape().data(),
strides_.data(),
out_paddings.data(),
group_,
in_paddings.data(),
out_shape.data(),
nullptr);
return kernel_->Compute(context,
input,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册