Commit ff5d176e authored by Liangliang He

Add depthwise conv2d opencl kernel

Parent: 0279acae
...@@ -21,15 +21,15 @@ struct Conv2dFunctorBase { ...@@ -21,15 +21,15 @@ struct Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const float prelu_alpha) const float prelu_alpha)
: strides_(strides), : strides_(strides),
dilations_(dilations),
paddings_(paddings), paddings_(paddings),
dilations_(dilations),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit), relux_max_limit_(relux_max_limit),
prelu_alpha_(prelu_alpha) {} prelu_alpha_(prelu_alpha) {}
const int *strides_; // [stride_h, stride_w] const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
const Padding paddings_; const Padding paddings_;
const int *dilations_; // [dilation_h, dilation_w]
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
const float prelu_alpha_; const float prelu_alpha_;
...@@ -50,8 +50,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -50,8 +50,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
relux_max_limit, relux_max_limit,
prelu_alpha) {} prelu_alpha) {}
void operator()(const Tensor *input, void operator()(const Tensor *input, // NHWC
const Tensor *filter, const Tensor *filter, // HWIO
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
......
...@@ -2,28 +2,57 @@ ...@@ -2,28 +2,57 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_ #ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_DEPTHWISE_CONV_H_ #define MACE_KERNELS_DEPTHWISE_CONV2D_H_
#include "mace/core/future.h"
#include "mace/core/common.h" #include "mace/core/common.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/core/future.h"
#include "mace/core/public/mace.h" #include "mace/core/public/mace.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctorBase(const int *strides,
const Padding padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float prelu_alpha)
: strides_(strides),
padding_(padding),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit),
prelu_alpha_(prelu_alpha) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_;
const int *dilations_; // [dilation_h, dilation_w]
const ActivationType activation_;
const float relux_max_limit_;
const float prelu_alpha_;
};
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct DepthwiseConv2dFunctor { struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor() {}
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(const int *strides,
const std::vector<int> &paddings, const Padding padding,
const int *dilations) const int *dilations,
: strides_(strides), paddings_(paddings), dilations_(dilations) {} const ActivationType activation,
const float relux_max_limit,
void operator()(const Tensor *input, // NCHW const float prelu_alpha)
const Tensor *filter, // c_out, c_in, kernel_h, kernel_w : DepthwiseConv2dFunctorBase(strides,
const Tensor *bias, // c_out padding,
dilations,
activation,
relux_max_limit,
prelu_alpha) {}
void operator()(const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias, // O
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(input);
...@@ -31,18 +60,36 @@ struct DepthwiseConv2dFunctor { ...@@ -31,18 +60,36 @@ struct DepthwiseConv2dFunctor {
MACE_CHECK_NOTNULL(bias); MACE_CHECK_NOTNULL(bias);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->shape()[0];
fake_filter_shape[1] = filter->shape()[1];
fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
fake_filter_shape[2] = 1;
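    // (The real filter layout is HWIM: [kernel_h, kernel_w, in_channels, multiplier].
    // The fake HWIO shape keeps H and W, uses a single input channel and
    // in_channels * multiplier output channels, so the shared conv2d helper below
    // computes the depthwise paddings and output shape.)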
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
padding_, output_shape.data(), paddings.data());
auto input_shape = fake_filter_shape;
output->Resize(output_shape);
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t height = output->dim(1);
index_t height = output->dim(2); index_t width = output->dim(2);
index_t width = output->dim(3); index_t channels = output->dim(3);
index_t input_batch = input->dim(0); index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1); index_t input_height = input->dim(1);
index_t input_height = input->dim(2); index_t input_width = input->dim(2);
index_t input_width = input->dim(3); index_t input_channels = input->dim(3);
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(1);
index_t multiplier = filter->dim(3);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels);
MACE_CHECK(channels == input_channels * multiplier);
int stride_h = strides_[0]; int stride_h = strides_[0];
int stride_w = strides_[1]; int stride_w = strides_[1];
...@@ -53,13 +100,12 @@ struct DepthwiseConv2dFunctor { ...@@ -53,13 +100,12 @@ struct DepthwiseConv2dFunctor {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings_[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings_[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2; index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2; index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
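    // paddings[] holds the total padding for each spatial dimension; half of it
    // (rounded down) is applied before the input and the remainder after.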
index_t kernel_size = kernel_h * kernel_w; const index_t kernel_size = kernel_h * kernel_w;
index_t multiplier = filter->dim(0);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
...@@ -72,15 +118,17 @@ struct DepthwiseConv2dFunctor { ...@@ -72,15 +118,17 @@ struct DepthwiseConv2dFunctor {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int n = 0; n < batch; ++n) { for (int n = 0; n < batch; ++n) {
for (int c = 0; c < channels; ++c) {
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
index_t offset = n * channels * height * width + for (int c = 0; c < channels; ++c) {
c * height * width + h * width + w; const index_t inc = c / multiplier;
const index_t m = c % multiplier;
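          // Output channel c corresponds to input channel inc = c / multiplier and
          // filter copy m = c % multiplier; in the HWIM filter buffer, successive
          // taps for the same (inc, m) are input_channels * multiplier elements apart.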
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
output_ptr[offset] = bias_channel; output_ptr[offset] = bias_channel;
T sum = 0; T sum = 0;
const T *filter_base = filter_ptr + c * kernel_size; const T *filter_base = filter_ptr + inc * multiplier + m;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
...@@ -92,12 +140,12 @@ struct DepthwiseConv2dFunctor { ...@@ -92,12 +140,12 @@ struct DepthwiseConv2dFunctor {
"Out of range read from input: ", inh, ", ", inw); "Out of range read from input: ", inh, ", ", inw);
} else { } else {
index_t input_offset = index_t input_offset =
n * input_channels * input_height * input_width + n * input_height * input_width * input_channels +
(c / multiplier) * input_height * input_width + inh * input_width * input_channels +
inh * input_width + inw; inw * input_channels + inc;
sum += input_ptr[input_offset] * *filter_base; sum += input_ptr[input_offset] * filter_base[0]; // HWIM
} }
++filter_base; filter_base += input_channels * multiplier;
} }
} }
output_ptr[offset] += sum; output_ptr[offset] += sum;
...@@ -106,10 +154,6 @@ struct DepthwiseConv2dFunctor { ...@@ -106,10 +154,6 @@ struct DepthwiseConv2dFunctor {
} }
} }
} }
const int *strides_; // [stride_h, stride_w]
std::vector<int> paddings_; // [padding_h, padding_w]
const int *dilations_; // [dilation_h, dilation_w]
}; };
template <> template <>
...@@ -120,15 +164,30 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()( ...@@ -120,15 +164,30 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
template <> template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()( struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
const Tensor *input, : DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float prelu_alpha)
: DepthwiseConv2dFunctorBase(strides,
padding,
dilations,
activation,
relux_max_limit,
prelu_alpha) {}
void operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_DEPTHWISE_CONV_H_ #endif // MACE_KERNELS_DEPTHWISE_CONV2D_H_
OpenCL Image Storage Layout
===
Input/Output
---
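An NHWC tensor is stored as a 2D image of shape [(C + 3) / 4 * W, N * H]. Four consecutive channels are packed into one RGBA texel; along x the image is ordered by channel block then width, and along y by batch then height (kernel-side view: [c%4 * w * c/4, h * b]).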
Conv2D Filter
---
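An HWIO conv2d filter is stored as a 2D image of shape [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4], with four consecutive output channels packed into one texel (kernel-side view: [cout%4 * cin * kh * kw, cout/4]).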
Depthwise Conv2D Filter
---
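An HWIM depthwise filter (M is the channel multiplier) is stored as a 2D image of shape [H * W * M, (Ic + 3) / 4], with four consecutive input channels packed into one texel (kernel-side view: [ic%4 * kh * kw * m, ic/4]).

Below is a minimal, self-contained C++ sketch of how these three image shapes can be derived from tensor shapes. It mirrors CalImage2DShape in mace/kernels/opencl/helper.cc but is illustrative only; the function names here are made up for the example.

#include <cstdint>
#include <vector>

// Ceil-divide by 4: how many RGBA texels are needed to hold x values.
inline int64_t RoundUpDiv4(int64_t x) { return (x + 3) / 4; }

// IN_OUT: NHWC tensor -> [(C + 3) / 4 * W, N * H]
std::vector<size_t> InOutImageShape(const std::vector<int64_t> &nhwc) {
  return {static_cast<size_t>(RoundUpDiv4(nhwc[3]) * nhwc[2]),
          static_cast<size_t>(nhwc[0] * nhwc[1])};
}

// CONV2D_FILTER: HWIO filter -> [H * W * RoundUp<4>(Ic), (Oc + 3) / 4]
std::vector<size_t> Conv2dFilterImageShape(const std::vector<int64_t> &hwio) {
  return {static_cast<size_t>(hwio[0] * hwio[1] * RoundUpDiv4(hwio[2]) * 4),
          static_cast<size_t>(RoundUpDiv4(hwio[3]))};
}

// DW_CONV2D_FILTER: HWIM filter -> [H * W * M, (Ic + 3) / 4]
std::vector<size_t> DepthwiseConv2dFilterImageShape(const std::vector<int64_t> &hwim) {
  return {static_cast<size_t>(hwim[0] * hwim[1] * hwim[3]),
          static_cast<size_t>(RoundUpDiv4(hwim[2]))};
}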
...@@ -27,9 +27,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -27,9 +27,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
string kernel_name; string kernel_name;
switch (type) { switch (type) {
case FILTER: case CONV2D_FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image"; kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break; break;
case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break;
case IN_OUT: case IN_OUT:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
break; break;
......
...@@ -72,6 +72,56 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc ...@@ -72,6 +72,56 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
} }
} }
__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int filter_w,
__private const int in_channel,
__private const int multiplier,
__write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
const int w = get_global_id(0);
const int h = get_global_id(1);
DATA_TYPE4 values = 0;
if (multiplier == 1) {
const int in_channel_idx = h << 2;
const int h_idx = w / filter_w;
const int w_idx = w % filter_w;
const int offset = mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx);
const int size = in_channel - in_channel_idx;
if (in_channel_idx < in_channel) {
if (size < 4) {
switch(size) {
case 3:
values.z = *(input + offset + 2);
case 2:
values.y = *(input + offset + 1);
case 1:
values.x = *(input + offset);
}
} else {
values = vload4(0, input + offset);
}
}
} else {
const int in_channel_idx = h << 2;
const int m = w % multiplier;
const int hw_idx = w / multiplier;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx),
multiplier, m);
// TODO support multiplier > 1
}
int2 coord = (int2)(w, h);
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int height, __private const int height,
__private const int width, __private const int width,
......
#include <common.h> #include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif #endif
...@@ -15,6 +15,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -15,6 +15,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private const int out_width, __private const int out_width,
__private const int filter_height, __private const int filter_height,
__private const int filter_width, __private const int filter_width,
__private const int stride,
__private const int padding_top, __private const int padding_top,
__private const int padding_left, __private const int padding_left,
__private const int dilation_h, __private const int dilation_h,
...@@ -38,19 +39,12 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -38,19 +39,12 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
DATA_TYPE4 out3 = 0; DATA_TYPE4 out3 = 0;
#endif #endif
#if STRIDE == 1 int in_width_stride = mul24(out_w_blks, stride);
int in_width0 = out_w_blk - padding_left; int in_width0 = mad24(out_w_blk, stride, -padding_left);
int in_width1 = in_width0 + out_w_blks; int in_width1 = in_width0 + in_width_stride;
int in_width2 = in_width1 + out_w_blks; int in_width2 = in_width1 + in_width_stride;
int in_width3 = in_width2 + out_w_blks; int in_width3 = in_width2 + in_width_stride;
const int height_idx = (out_hb % out_height) - padding_top; const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
#else
int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
const int height_idx = ((out_hb % out_height) << 1) - padding_top;
#endif
const int batch_idx = mul24((out_hb / out_height), in_height); const int batch_idx = mul24((out_hb / out_height), in_height);
const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width); const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
...@@ -61,6 +55,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -61,6 +55,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int in_idx = mul24(in_ch_blk, in_width); const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2; int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
// TODO (heliangliang) optimize out these muls
int in_hb_value = height_idx + mul24(hb_idx, dilation_h); int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
......
...@@ -37,7 +37,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -37,7 +37,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.z = w.y + out_w_blks; w.z = w.y + out_w_blks;
w.w = w.z + out_w_blks; w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height); int out_hb_idx = (out_hb % height);
#else #elif STRIDE == 2
w.x = out_w_blk << 1; w.x = out_w_blk << 1;
w.y = (out_w_blk + out_w_blks) << 1; w.y = (out_w_blk + out_w_blks) << 1;
w.z = (out_w_blk + (out_w_blks << 1)) << 1; w.z = (out_w_blk + (out_w_blks << 1)) << 1;
......
#include <common.h> #include <common.h>
__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif #endif
...@@ -45,7 +45,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -45,7 +45,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int in_width3 = in_width2 + out_w_blks; int in_width3 = in_width2 + out_w_blks;
int in_width4 = in_width3 + out_w_blks; int in_width4 = in_width3 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top; const int height_idx = (out_hb % out_height) - padding_top;
#else #elif STRIDE == 2
int in_width0 = (out_w_blk << 1) - padding_left; int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
...@@ -63,6 +63,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -63,6 +63,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int in_idx = mul24(in_ch_blk, in_width); const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2; int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
// TODO (heliangliang) optimize out these muls
int in_hb_value = height_idx + mul24(hb_idx, dilation_h); int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
......
#include <common.h>
// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const DATA_TYPE relux_max_limit,
__private const DATA_TYPE prelu_alpha,
__private const short in_height,
__private const short in_width,
__private const short in_ch_blks,
__private const short out_height,
__private const short out_width,
__private const short filter_height,
__private const short filter_width,
__private const short padding_top,
__private const short padding_left,
__private const short dilation_h,
__private const short dilation_w) {
const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1);
const short out_w_blks = get_global_size(1);
const short out_hb = get_global_id(2);
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
const short out_h = out_hb % out_height;
#if STRIDE == 1
const short in_width0 = out_w_blk - padding_left;
const short in_width1 = in_width0 + out_w_blks;
const short in_width2 = in_width1 + out_w_blks;
const short in_width3 = in_width2 + out_w_blks;
const short height_idx = out_h - padding_top;
#elif STRIDE == 2
int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
const int height_idx = (out_h << 1) - padding_top;
#else
const short in_width_stride = mul24(out_w_blks, STRIDE);
const short in_width0 = mad24(out_w_blk, STRIDE, -padding_left);
const short in_width1 = in_width0 + in_width_stride;
const short in_width2 = in_width1 + in_width_stride;
const short in_width3 = in_width2 + in_width_stride;
const short height_idx = mad24(out_h, STRIDE, -padding_top);
#endif
const short batch_idx = mul24((out_hb / out_height), in_height);
const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
const short in_idx = mul24(in_ch_blk, in_width);
short filter_idx = 0;
short in_hb_idx = height_idx;
for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
short in_hb = select(in_hb_idx + batch_idx,
-1,
(in_hb_idx < 0 || in_hb_idx >= in_height));
short in_w_idx = 0;
for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
short in_w;
DATA_TYPE4 in0, in1, in2, in3;
#define READ_INPUT(i) \
in_w = in_w_idx + in_width##i; \
in_w = select(in_idx + in_w, \
-1, \
(in_w < 0 || in_w >= in_width)); \
in##i = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
#undef READ_INPUT
DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER,
(int2)(filter_idx, in_ch_blk));
out0 = mad(in0, weights, out0);
out1 = mad(in1, weights, out1);
out2 = mad(in2, weights, out2);
out3 = mad(in3, weights, out3);
++filter_idx;
in_w_idx += dilation_w;
}
in_hb_idx += dilation_h;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, prelu_alpha);
out1 = do_activation(out1, relux_max_limit, prelu_alpha);
out2 = do_activation(out2, relux_max_limit, prelu_alpha);
out3 = do_activation(out3, relux_max_limit, prelu_alpha);
#endif
const short out_x_base = mul24(out_ch_blk, out_width);
short w = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
#include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr,
const int row_width) {
VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(1, filter_ptr);
res += input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(2, filter_ptr);
res += input_value * filter_value;
return res.s0 + res.s1 + res.s2;
}
// TODO: merge the depthwise kernel with conv 3x3 to remove duplicate code.
__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
__global const DATA_TYPE *filter, /* m, i, kh, kw */
#ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int multiplier = out_chan_num / in_chan_num;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
#else
const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
#endif
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
const DATA_TYPE *filter_ptr = filter + i * 9;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
#ifdef BIAS
VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
#else
VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
#endif /* defined(BIAS) */
#ifdef STRIDE_1
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#else
res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
#ifdef BIAS
DATA_TYPE res = bias[i];
#else
DATA_TYPE res = 0;
#endif
res += conv3x3(input_ptr, filter_ptr, in_width);
output_ptr[p] = res;
#ifdef STRIDE_1
input_ptr += 1;
#else
input_ptr += 2;
#endif
}
}
}
}
...@@ -81,13 +81,14 @@ void Conv2dOpencl(const Tensor *input, ...@@ -81,13 +81,14 @@ void Conv2dOpencl(const Tensor *input,
*(static_cast<const cl::Image2D *>(output->buffer()))); *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, relux_max_limit); conv_2d_kernel.setArg(idx++, relux_max_limit);
conv_2d_kernel.setArg(idx++, prelu_alpha); conv_2d_kernel.setArg(idx++, prelu_alpha);
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2))); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height)); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(width));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0))); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(0)));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1))); conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(stride));
conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2);
conv_2d_kernel.setArg(idx++, dilations[0]); conv_2d_kernel.setArg(idx++, dilations[0]);
......
...@@ -2,60 +2,164 @@ ...@@ -2,60 +2,164 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/depthwise_conv2d.h" #include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter, void DepthwiseConv2d(const Tensor *input, // NHWC
const Tensor *bias, Tensor *output, const Tensor *filter, // HWIM
StatsFuture *future); const Tensor *bias,
const int stride,
const int *paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float prelu_alpha,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_channels = input->dim(3);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
const index_t multiplier = filter->dim(3);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
    default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
auto dw_conv2d_kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter, uint32_t idx = 0;
const Tensor *bias, Tensor *output, dw_conv2d_kernel.setArg(idx++,
StatsFuture *future); *(static_cast<const cl::Image2D *>(input->buffer())));
template <> dw_conv2d_kernel.setArg(
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
dw_conv2d_kernel.setArg(
idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
dw_conv2d_kernel.setArg(
idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
dw_conv2d_kernel.setArg(idx++, relux_max_limit);
dw_conv2d_kernel.setArg(idx++, prelu_alpha);
dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_height));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_width));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_channel_blocks));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(height));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(width));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_height));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_width));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[0] / 2));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[1] / 2));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[0]));
dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[1]));
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
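  // Each work-item produces one 4-channel output block at up to four width
  // positions, hence the (channel blocks, ceil(width / 4), height * batch) grid.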
std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier);
TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future);
}
template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, typedef void (*Conv2dOpenclFunction)(const Tensor *input,
const Tensor *bias, Tensor *output, const Tensor *filter, const Tensor *bias,
StatsFuture *future); Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{nullptr, nullptr},
{nullptr, nullptr},
{DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || if (strides_[0] != strides_[1]) {
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer // TODO(heliangliang) The CPU/NEON kernel should map the buffer
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)( DepthwiseConv2dFunctor<DeviceType::CPU, float>(
input, filter, bias, output, future); strides_, padding_, dilations_, activation_, relux_max_limit_,
prelu_alpha_)(input, filter, bias, output, future);
return; return;
} }
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; // Create a fake conv_2d filter to calculate the paddings and output size
if (paddings_[0] > 0 || paddings_[1] > 0) { std::vector<index_t> fake_filter_shape(4);
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v()); fake_filter_shape[0] = filter->shape()[0];
ConstructInputWithPadding(input, paddings_.data(), &padded_input); fake_filter_shape[1] = filter->shape()[1];
conv2d_func(&padded_input, filter, bias, output, future); fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
}else { fake_filter_shape[2] = 1;
conv2d_func(input, filter, bias, output, future);
} std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
padding_, output_shape.data(), paddings.data());
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, prelu_alpha_,
DataTypeToEnum<T>::value, output, future);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const uint32_t stride,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_batch = input->dim(0);
const index_t input_channels = input->dim(1);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
MACE_CHECK(input_batch == batch);
const index_t pixels = height * width;
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv_3x3");
built_options.emplace("-Ddepthwise_conv_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", kernel_name, built_options);
uint32_t idx = 0;
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
if (bias != nullptr) {
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
}
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(pixel_blocks)};
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};
} // namespace kernels
} // namespace mace
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
// [(c+3)/4*W, N * H] // [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
...@@ -18,13 +18,22 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -18,13 +18,22 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
image_shape[1] = shape[0] * shape[1]; image_shape[1] = shape[0] * shape[1];
} }
// [H * W * RoundUp<4>(Ic), (Oc + 3) / 4] // [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO*/ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4); image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4);
image_shape[1] = RoundUpDiv4(shape.back()); image_shape[1] = RoundUpDiv4(shape[3]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3];
image_shape[1] = RoundUpDiv4(shape[2]);
} }
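// Example for the depthwise filter shape above: a {3, 3, 16, 1} filter
// (3x3 kernel, 16 input channels, multiplier 1) maps to a 9 x 4 image.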
// [(size + 3) / 4, 1] // [(size + 3) / 4, 1]
...@@ -40,11 +49,17 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -40,11 +49,17 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
switch (type) { switch (type) {
case FILTER:CalFilterImageShape(shape, image_shape); case CONV2D_FILTER:
CalConv2dFilterImageShape(shape, image_shape);
break;
case DW_CONV2D_FILTER:
CalDepthwiseConv2dFilterImageShape(shape, image_shape);
break; break;
case IN_OUT:CalInOutputImageShape(shape, image_shape); case IN_OUT:
CalInOutputImageShape(shape, image_shape);
break; break;
case ARGUMENT:CalArgImageShape(shape, image_shape); case ARGUMENT:
CalArgImageShape(shape, image_shape);
break; break;
default:LOG(FATAL) << "Mace not supported yet."; default:LOG(FATAL) << "Mace not supported yet.";
} }
......
...@@ -17,9 +17,10 @@ namespace kernels { ...@@ -17,9 +17,10 @@ namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType { enum BufferType {
FILTER = 0, CONV2D_FILTER = 0,
IN_OUT = 1, DW_CONV2D_FILTER = 1,
ARGUMENT = 2 IN_OUT = 2,
ARGUMENT = 3
}; };
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
......
...@@ -20,7 +20,7 @@ class BufferToImageOp: public Operator<D, T> { ...@@ -20,7 +20,7 @@ class BufferToImageOp: public Operator<D, T> {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>( kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER))); "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output, future); functor_(const_cast<Tensor *>(input_tensor), type, output, future);
......
...@@ -75,27 +75,27 @@ TEST(BufferToImageTest, InputLarge) { ...@@ -75,27 +75,27 @@ TEST(BufferToImageTest, InputLarge) {
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5});
} }
TEST(BufferToImageTest, Filter1x1Media) { TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5});
} }
TEST(BufferToImageTest, Filter3x3Meida) { TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256});
} }
template<DeviceType D, typename T> template<DeviceType D, typename T>
......
...@@ -36,7 +36,7 @@ static void Conv2d(int iters, ...@@ -36,7 +36,7 @@ static void Conv2d(int iters,
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
......
...@@ -102,7 +102,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -102,7 +102,7 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -159,7 +159,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -159,7 +159,7 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -264,7 +264,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -264,7 +264,7 @@ void TestNHWCSimple3x3WithoutBias() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -371,7 +371,7 @@ static void TestNHWCCombined3x3() { ...@@ -371,7 +371,7 @@ static void TestNHWCCombined3x3() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -444,7 +444,7 @@ void TestConv1x1() { ...@@ -444,7 +444,7 @@ void TestConv1x1() {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", BufferToImage<D, float>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", BufferToImage<D, float>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -535,7 +535,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -535,7 +535,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -628,7 +628,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -628,7 +628,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
BufferToImage<D, half>(net, "Input", "InputImage", BufferToImage<D, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", BufferToImage<D, half>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -759,7 +759,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -759,7 +759,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
......
...@@ -15,7 +15,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -15,7 +15,7 @@ TEST(CoreTest, INIT_MODE) {
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", kernels::BufferType::FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
.AddIntArg("mode", static_cast<int>(NetMode::INIT)) .AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
...@@ -33,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -33,7 +33,7 @@ TEST(CoreTest, INIT_MODE) {
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("Output") .Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
NetDef net_def; NetDef net_def;
......
...@@ -26,6 +26,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { ...@@ -26,6 +26,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
} }
} // namespace mace } // namespace mace
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#ifndef MACE_OPS_DEPTHWISE_CONV_H_ #ifndef MACE_OPS_DEPTHWISE_CONV2D_H_
#define MACE_OPS_DEPTHWISE_CONV_H_ #define MACE_OPS_DEPTHWISE_CONV2D_H_
#include <memory> #include <memory>
...@@ -18,10 +18,13 @@ template <DeviceType D, typename T> ...@@ -18,10 +18,13 @@ template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
public: public:
DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws) { : ConvPool2dOpBase<D, T>(op_def, ws),
functor_.strides_ = this->strides_.data(); functor_(this->strides_.data(),
functor_.dilations_ = this->dilations_.data(); this->padding_,
} this->dilations_.data(),
kernels::ActivationType::NOOP,
0.0f,
0.0f) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -31,23 +34,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -31,23 +34,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
bias = this->Input(BIAS); bias = this->Input(BIAS);
} }
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
// resize filter shape.
std::vector<index_t> filter_shape(filter->shape().begin(),
filter->shape().end());
filter_shape[0] *= filter_shape[1];
filter_shape[1] = 1;
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output, future); functor_(input, filter, bias, output, future);
return true; return true;
} }
...@@ -61,4 +48,4 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -61,4 +48,4 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
} // namespace mace } // namespace mace
#endif // MACE_OPS_DEPTHWISE_CONV_H_ #endif // MACE_OPS_DEPTHWISE_CONV2D_H_
...@@ -7,13 +7,48 @@ ...@@ -7,13 +7,48 @@
using namespace mace; using namespace mace;
namespace {
class DepthwiseConv2dOpTest : public OpsTestBase {}; class DepthwiseConv2dOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D, typename T>
void SimpleValidTest() { void SimpleValidTest() {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
net.AddInputFromArray<D, float>(
"Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
...@@ -22,30 +57,150 @@ void SimpleValidTest() { ...@@ -22,30 +57,150 @@ void SimpleValidTest() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
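  // The first two expected entries can be reproduced by hand: depthwise conv
  // applies one 2x2 filter per input channel, so the VALID output at (0, 0) is
  //   channel 0: 1*1 + 2*2 + 4*3 + 5*4 + 0.1 (bias) = 37.1
  //   channel 1: 2*2 + 4*4 + 8*6 + 10*8 + 0.2 (bias) = 148.2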
auto expected = CreateTensor<T>({1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f,
67.1f, 268.2f, 77.1f, 308.2f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
SimpleValidTest<DeviceType::CPU, float>();
}
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
SimpleValidTest<DeviceType::OPENCL, float>();
}
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
SimpleValidTest<DeviceType::OPENCL, half>();
}
template <DeviceType D, typename T>
void ComplexValidTest() {
testing::internal::LogToStderr();
// Construct graph
OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 2, 2, 3},
{1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {2, 2, 2, 2}, "Input", {1, 10, 10, 3},
{1.0f, 5.0f, 9.0f, 13.0f, 2.0f, 6.0f, 10.0f, 14.0f, 3.0f, 7.0f, 11.0f, {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11,
15.0f, 4.0f, 8.0f, 12.0f, 16.0f}); 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23,
net.AddInputFromArray<D, float>("Bias", {4}, {.1f, .2f, .3f, .4f}); 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35,
0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47,
0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71,
0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83,
0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95,
0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07,
1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19,
1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31,
1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43,
1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55,
1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67,
1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79,
1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91,
1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03,
2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15,
2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27,
2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39,
2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51,
2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63,
2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75,
2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87,
2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99});
net.AddInputFromArray<D, float>(
"Filter", {5, 5, 3, 1},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74});
net.AddInputFromArray<D, float>("Bias", {6},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<T>(
{1, 4, 1, 2}, {1, 5, 5, 3},
{196.1f, 252.1f, 216.2f, 280.2f, 272.3f, 344.3f, 296.4f, 376.4f}); {4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976,
6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988,
7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977,
8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103,
11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985,
12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036,
13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741,
17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947,
18.20499992, 18.72999954, 9.97200012, 10.28399944, 10.59899998,
17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046,
23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029,
23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996,
6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024,
9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968,
9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027});
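  // Note: the 0.2 tolerance is loose because this check is shared with the
  // half-precision OpenCL run, where the 5x5 accumulations lose noticeable
  // accuracy.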
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
} }
TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { SimpleValidTest<DeviceType::CPU>(); } TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
ComplexValidTest<DeviceType::CPU, float>();
}
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
ComplexValidTest<DeviceType::OPENCL, float>();
}
template <DeviceType D> TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::OPENCL, half>();
}
template <DeviceType D, typename T>
void TestNxNS12(const index_t height, const index_t width) { void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -53,11 +208,18 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -53,11 +208,18 @@ void TestNxNS12(const index_t height, const index_t width) {
srand(time(NULL)); srand(time(NULL));
// generate random input // generate random input
index_t batch = 1; index_t batch = 1 + rand() % 5;
index_t input_channels = 3; index_t input_channels = 3 + rand() % 16;
index_t multiplier = 2; index_t multiplier = 1;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
...@@ -66,24 +228,56 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -66,24 +228,56 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Run on cpu
net.AddRandomInput<D, float>("Input", net.RunOp();
{batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
// Run on device
net.RunOp(D);
// Check // Check
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run cpu if (D == DeviceType::OPENCL) {
net.RunOp(); BufferToImage<D, T>(net, "Input", "InputImage",
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("DeviceOutput")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
}; };
for (int kernel_size : {3}) { for (int kernel_size : {3}) {
...@@ -96,32 +290,47 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -96,32 +290,47 @@ void TestNxNS12(const index_t height, const index_t width) {
#if __ARM_NEON #if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) { TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
TestNxNS12<DeviceType::NEON>(4, 4); TestNxNS12<DeviceType::NEON, float>(4, 4);
} }
#endif #endif
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) { TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
TestNxNS12<DeviceType::OPENCL>(4, 4); TestNxNS12<DeviceType::OPENCL, float>(4, 4);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(4, 4);
} }
#if __ARM_NEON #if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) { TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
TestNxNS12<DeviceType::NEON>(64, 64); TestNxNS12<DeviceType::NEON, float>(64, 64);
TestNxNS12<DeviceType::NEON>(128, 128); TestNxNS12<DeviceType::NEON, float>(128, 128);
} }
#endif #endif
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) { TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(64, 64); TestNxNS12<DeviceType::OPENCL, float>(64, 64);
TestNxNS12<DeviceType::OPENCL>(128, 128); TestNxNS12<DeviceType::OPENCL, float>(128, 128);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(64, 64);
TestNxNS12<DeviceType::OPENCL, half>(128, 128);
} }
#if __ARM_NEON #if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) { TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
TestNxNS12<DeviceType::NEON>(107, 113); TestNxNS12<DeviceType::NEON, float>(107, 113);
} }
#endif #endif
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) { TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(107, 113); TestNxNS12<DeviceType::OPENCL, float>(107, 113);
} }
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(107, 113);
}
} // namespace
...@@ -14,17 +14,42 @@ namespace mace { ...@@ -14,17 +14,42 @@ namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void DepthwiseConv2d(int iters, static void DepthwiseConv2d(int iters,
int batch, int batch,
int channels, int input_channels,
int height, int height,
int width, int width,
int kernel_h, int kernel_h,
int kernel_w, int kernel_w,
int stride, int stride,
Padding padding, Padding padding,
int output_channels) { int multiplier) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
...@@ -33,25 +58,21 @@ static void DepthwiseConv2d(int iters, ...@@ -33,25 +58,21 @@ static void DepthwiseConv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
}
// Add input data
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels * channels});
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 2; ++i) {
net.RunOp(D); net.RunOp(D);
}
net.Sync(); net.Sync();
}
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.RunOp(D);
}
net.Sync(); net.Sync();
}
} }
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \ #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
...@@ -72,16 +93,17 @@ static void DepthwiseConv2d(int iters, ...@@ -72,16 +93,17 @@ static void DepthwiseConv2d(int iters,
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 2, float); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);
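// Each line above registers one benchmark case, e.g.
// BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float) times a
// 3x3, stride-1, VALID depthwise conv over a 1x32x32x64 NHWC input with
// channel multiplier 1, on each registered device (CPU and OPENCL).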
} // namespace mace } // namespace mace
...@@ -26,7 +26,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -26,7 +26,7 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
...@@ -83,7 +83,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -83,7 +83,7 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
...@@ -151,7 +151,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -151,7 +151,7 @@ void TestNHWCSimple3x3WithoutBias() {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -220,7 +220,7 @@ void TestConv1x1() { ...@@ -220,7 +220,7 @@ void TestConv1x1() {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", BufferToImage<D, float>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", BufferToImage<D, float>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -311,7 +311,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -311,7 +311,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -397,7 +397,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -397,7 +397,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
BufferToImage<D, half>(net, "Input", "InputImage", BufferToImage<D, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", BufferToImage<D, half>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -475,7 +475,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape, ...@@ -475,7 +475,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -551,7 +551,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat ...@@ -551,7 +551,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
...@@ -633,7 +633,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -633,7 +633,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on gpu // run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
......
...@@ -21,7 +21,7 @@ class ImageToBufferOp: public Operator<D, T> { ...@@ -21,7 +21,7 @@ class ImageToBufferOp: public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
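    // "buffer_type" selects the image layout being converted back to a buffer;
    // it defaults to CONV2D_FILTER here, though callers such as the
    // ImageToBuffer test helper normally pass the type explicitly.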
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>( kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER))); "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor), future); functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true; return true;
} }
......