提交 336f35b4 编写于 作者: L liuqi

Finish atrous convolution: faster than space_to_batch method.

上级 41a905c9
...@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
img_format, img_format,
image_shape[0], image_shape[1], image_shape[0], image_shape[1],
0, nullptr, &error); 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "]";
return cl_image; return cl_image;
} }
......
...@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name, ...@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*program = cl::Program(this->context(), {device()}, {binary}); *program = cl::Program(this->context(), {device()}, {binary});
#else #else
std::string source_filename = kernel_path_ + program_file_name; std::string source_filename = kernel_path_ + program_file_name;
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
// Create program // Create program
bool is_binary_filename_exist = std::ifstream(binary_filename).is_open(); if (std::ifstream(source_filename).is_open()) {
if (is_binary_filename_exist) {
VLOG(1) << "Create program with binary: " << binary_filename;
std::vector<unsigned char> binary;
MACE_CHECK(ReadFile(binary_filename, true, &binary));
*program = cl::Program(this->context(), {device()}, {binary});
} else if (std::ifstream(source_filename).is_open()) {
VLOG(1) << "Create program with source: " << source_filename; VLOG(1) << "Create program with source: " << source_filename;
std::vector<unsigned char> kernel_source; std::vector<unsigned char> kernel_source;
MACE_CHECK(ReadFile(source_filename, false, &kernel_source)); MACE_CHECK(ReadFile(source_filename, false, &kernel_source));
...@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name, ...@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*program = cl::Program(this->context(), sources); *program = cl::Program(this->context(), sources);
} else { } else {
LOG(FATAL) << "Failed to open kernel file " << binary_filename << " or " LOG(FATAL) << "Failed to open kernel file " << source_filename;
<< source_filename;
} }
#endif #endif
...@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name, ...@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
#ifndef MACE_EMBED_BINARY_PROGRAM #ifndef MACE_EMBED_BINARY_PROGRAM
// Write binary if necessary // Write binary if necessary
if (!is_binary_filename_exist) { std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1; size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes( std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]); new size_t[device_list_size]);
cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES, cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * device_list_size, sizeof(size_t) * device_list_size,
program_binary_sizes.get(), nullptr); program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err; MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries( std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[device_list_size]); new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) { for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>( program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]); new unsigned char[program_binary_sizes[i]]);
} }
err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES, err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
sizeof(unsigned char *) * device_list_size, sizeof(unsigned char *) * device_list_size,
program_binaries.get(), nullptr); program_binaries.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err; MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::vector<unsigned char> content( std::vector<unsigned char> content(
reinterpret_cast<unsigned char const *>(program_binaries[0].get()), reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) + reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
program_binary_sizes[0]); program_binary_sizes[0]);
MACE_CHECK(WriteFile(binary_filename, true, content)); MACE_CHECK(WriteFile(binary_filename, true, content));
}
#endif #endif
} }
......
...@@ -212,12 +212,14 @@ int main(int argc, char **argv) { ...@@ -212,12 +212,14 @@ int main(int argc, char **argv) {
LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us"; LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
} }
MACE_CHECK(engine.Run(input_data.get(), input_shape_vec, output_data.get()));
if (output_data != nullptr) { if (output_data != nullptr) {
ofstream out_file(output_file, ios::binary); ofstream out_file(output_file, ios::binary);
out_file.write((const char *) (output_data.get()), out_file.write((const char *) (output_data.get()),
output_size * sizeof(float)); output_size * sizeof(float));
out_file.flush(); out_file.flush();
out_file.close(); out_file.close();
LOG(INFO) << "Write output file done.";
} else {
LOG(ERROR) << "output data is null";
} }
} }
...@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private const int filter_height, __private const int filter_height,
__private const int filter_width, __private const int filter_width,
__private const int padding_top, __private const int padding_top,
__private const int padding_left) { __private const int padding_left,
__private const int dilation_h,
__private const int dilation_w) {
const int out_ch_blk = get_global_id(0); const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1); const int out_w_blks = get_global_size(1);
...@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int in_idx = mul24(in_ch_blk, in_width); const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2; int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx; int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
(in_hb_value < 0 || in_hb_value >= in_height)); (in_hb_value < 0 || in_hb_value >= in_height));
...@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
for (short width_idx = 0; width_idx < filter_width; ++width_idx) { for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + mul24(width_idx, dilation_w); \
in_width_value = select(in_idx + in_width_value, \ in_width_value = select(in_idx + in_width_value, \
-1, \ -1, \
(in_width_value < 0 || in_width_value >= in_width)); \ (in_width_value < 0 || in_width_value >= in_width)); \
......
...@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private const int out_height, __private const int out_height,
__private const int out_width, __private const int out_width,
__private const int padding_top, __private const int padding_top,
__private const int padding_left) { __private const int padding_left,
__private const int dilation_h,
__private const int dilation_w) {
const int out_ch_blk = get_global_id(0); const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1); const int out_w_blks = get_global_size(1);
...@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
DATA_TYPE4 in0, in1, in2, in3, in4; DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3; DATA_TYPE4 weights0, weights1, weights2, weights3;
int hb_idx, width_idx, in_width_idx;
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width); const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2; int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx; int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
(in_hb_value < 0 || in_hb_value >= in_height)); (in_hb_value < 0 || in_hb_value >= in_height));
...@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
for (short width_idx = 0; width_idx < 3; ++width_idx) { for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + mul24(width_idx, dilation_w); \
in_width_value = select(in_idx + in_width_value, \ in_width_value = select(in_idx + in_width_value, \
-1, \ -1, \
(in_width_value < 0 || in_width_value >= in_width)); \ (in_width_value < 0 || in_width_value >= in_width)); \
......
...@@ -10,29 +10,33 @@ namespace kernels { ...@@ -10,29 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, StatsFuture *future); const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, StatsFuture *future); const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, StatsFuture *future); const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, StatsFuture *future); const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding, const uint32_t stride, const int *padding,
const DataType dt, Tensor *output, const int *dilations, const DataType dt,
StatsFuture *future); Tensor *output, StatsFuture *future);
template<typename T> template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, const DataType dt, Tensor *output,
StatsFuture *future); StatsFuture *future);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = { static const Conv2dOpenclFunction selector[5][2] = {
...@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] || if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) { selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, false, paddings.data(), conv2d_func(input, filter, bias, false, paddings.data(), dilations_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} else { } else {
Conv2dOpencl(input, filter, bias, false, strides_[0], Conv2dOpencl(input, filter, bias, false, strides_[0],
paddings.data(), DataTypeToEnum<T>::value, paddings.data(), dilations_, DataTypeToEnum<T>::value,
output, future); output, future);
} }
......
...@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, ...@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *bias, const Tensor *bias,
const bool fused_relu, const bool fused_relu,
const int *padding, const int *padding,
const int *dilations,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input, ...@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *bias, const Tensor *bias,
const bool fused_relu, const bool fused_relu,
const int *padding, const int *padding,
const int *dilations,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
......
...@@ -15,8 +15,8 @@ namespace kernels { ...@@ -15,8 +15,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding, const uint32_t stride, const int *padding,
const DataType dt, Tensor *output, const int *dilations, const DataType dt,
StatsFuture *future) { Tensor *output, StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2);
conv_2d_kernel.setArg(idx++, dilations[0]);
conv_2d_kernel.setArg(idx++, dilations[1]);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
...@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input, ...@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *bias, const Tensor *bias,
const bool fused_relu, const bool fused_relu,
const int *padding, const int *padding,
const int *dilations,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future); Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dilations, dt, output, future);
}; };
void Conv2dOpenclK3x3S2(const Tensor *input, void Conv2dOpenclK3x3S2(const Tensor *input,
...@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input, ...@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const Tensor *bias, const Tensor *bias,
const bool fused_relu, const bool fused_relu,
const int *padding, const int *padding,
const int *dilations,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future); Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dilations, dt, output, future);
}; };
} // namespace kernels } // namespace kernels
......
...@@ -15,8 +15,8 @@ namespace kernels { ...@@ -15,8 +15,8 @@ namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter, void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding, const uint32_t stride, const int *padding,
const DataType dt, Tensor *output, const int *dilations, const DataType dt,
StatsFuture *future) { Tensor *output, StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, ...@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1))); conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2);
conv_2d_kernel.setArg(idx++, dilations[0]);
conv_2d_kernel.setArg(idx++, dilations[1]);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
......
...@@ -10,33 +10,33 @@ namespace kernels { ...@@ -10,33 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, const DataType dt, Tensor *output,
StatsFuture *future); StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, const DataType dt, Tensor *output,
StatsFuture *future); StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, const DataType dt, Tensor *output,
StatsFuture *future); StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, const DataType dt, Tensor *output,
StatsFuture *future); StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding, const uint32_t stride, const int *padding,
const DataType dt, Tensor *output, const int *dilations, const DataType dt,
StatsFuture *future); Tensor *output, StatsFuture *future);
template<typename T> template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu, const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt, const int *padding, const int *dilations,
Tensor *output, StatsFuture *future); const DataType dt, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = { static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
...@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{nullptr, nullptr}}; {nullptr, nullptr}};
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] || if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) { selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), conv2d_func(input, filter, bias, true, paddings.data(), dilations_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} else { } else {
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
DataTypeToEnum<T>::value, output, future); dilations_, DataTypeToEnum<T>::value, output, future);
} }
} }
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include <fstream>
#include "mace/ops/conv_2d.h" #include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
...@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { ...@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
template<DeviceType D> template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape) { const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
srand(time(NULL)); srand(time(NULL));
auto func = [&](int stride_h, int stride_w, Padding padding) { auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input // generate random input
index_t batch = 3 + (rand() % 10); index_t batch = 3;
index_t height = input_shape[0]; index_t height = input_shape[0];
index_t width = input_shape[1]; index_t width = input_shape[1];
index_t kernel_h = filter_shape[0]; index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1]; index_t kernel_w = filter_shape[1];
index_t input_channels = filter_shape[2] + (rand() % 10); index_t input_channels = filter_shape[2];
index_t output_channels = filter_shape[3] + (rand() % 10); index_t output_channels = filter_shape[3];
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.Output("Output") .Output("Output")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
std::vector<float> float_input_data; std::vector<float> float_input_data;
...@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
...@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}; };
for (int stride : {1, 2}) { func(1, 1, VALID);
func(stride, stride, VALID); func(1, 1, SAME);
func(stride, stride, SAME); if (dilations[0] == 1) {
func(2, 2, VALID);
func(2, 2, SAME);
} }
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 1, 32, 64}); {1, 1, 32, 64},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{3, 3, 32, 64}); {3, 3, 32, 64},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{15, 1, 256, 2}); {15, 1, 256, 2},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 15, 256, 2}); {1, 15, 256, 2},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 64}); {7, 7, 3, 64},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{1, 1, 5, 7}); {1, 1, 5, 7},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{3, 3, 5, 7}); {3, 3, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{5, 5, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{7, 7, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
{7, 7, 16, 16},
{4, 4});
}
template<DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
} }
// Float-precision dilated Conv2D vs the CPU reference: aligned and
// unaligned shapes, dilation rates 2 and 4.
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
                                                 2);
}

TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
                                                 4);
}

TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
                                                 4);
}
...@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) { ...@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
{15, 1, 32, 64}); {15, 1, 32, 64});
} }
template<DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
// Fused atrous convolutions: aligned input at dilation 2 and 4, plus an
// unaligned (107x113) case at dilation 2.
TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
}

TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {
  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
}

TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
  TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
}
// Compares a half-precision FusedConv2D on the GPU against the float CPU
// reference for atrous (dilated) convolution with larger kernels.
//
// image_shape:  {height, width} of the input.
// filter_shape: {kernel_h, kernel_w, input_channels, output_channels}.
// dilations:    {dilation_h, dilation_w} applied to both graphs.
//
// Bug fix: `dilations` was previously ignored — both operator definitions
// hard-coded {1, 1}, so the "AtrousConv" tests below never exercised
// dilation at all. The parameter is now wired through.
template<DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
                                      const std::vector<index_t> &filter_shape,
                                      const std::vector<int> &dilations) {
  testing::internal::LogToStderr();
  auto func = [&](int stride_h, int stride_w, Padding type) {
    srand(time(NULL));

    // generate random input
    index_t batch = 1;
    index_t height = image_shape[0];
    index_t width = image_shape[1];
    index_t input_channels = filter_shape[2];
    index_t output_channels = filter_shape[3];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];

    // Construct graph (float CPU reference).
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
        .Input("Input")
        .Input("Filter")
        .Input("Bias")
        .Output("Output")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .Finalize(net.NewOperatorDef());

    // Add input data
    net.AddRandomInput<D, float>("Input",
                                 {batch, height, width, input_channels});
    net.AddRandomInput<D, float>(
        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
    net.AddRandomInput<D, float>("Bias", {output_channels});

    // Run the reference implementation on CPU.
    net.RunOp();

    // Snapshot the CPU result before the GPU run.
    Tensor expected;
    expected.Copy(*net.GetOutput("Output"));

    // Run on GPU in half precision via image-backed tensors.
    BufferToImage<D, half>(net, "Input", "InputImage",
                           kernels::BufferType::IN_OUT);
    BufferToImage<D, half>(net, "Filter", "FilterImage",
                           kernels::BufferType::FILTER);
    BufferToImage<D, half>(net, "Bias", "BiasImage",
                           kernels::BufferType::ARGUMENT);

    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
        .Input("InputImage")
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
        .Finalize(net.NewOperatorDef());

    // Run on device
    net.RunOp(D);

    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
                            kernels::BufferType::IN_OUT);
    // Loose tolerance: half-precision accumulation over large kernels.
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
  };

  func(1, 1, VALID);
  func(1, 1, SAME);
}
// Half-precision atrous fused convolutions with large kernels.
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32},
                                                {7, 7, 3, 16},
                                                {2, 2});
}

// Bug fix: this test is named "D4" but previously passed dilations {2, 2};
// use {4, 4} so the name matches the behavior.
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71},
                                                {15, 15, 16, 16},
                                                {4, 4});
}
...@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> { ...@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>(); auto b = y.data<RES_TYPE>();
for (int n = 0; n < x.dim(0); ++n) { if (x.dim_size() == 4) {
for (int h = 0; h < x.dim(1); ++h) { for (int n = 0; n < x.dim(0); ++n) {
for (int w = 0; w < x.dim(2); ++w) { for (int h = 0; h < x.dim(1); ++h) {
for (int c = 0; c < x.dim(3); ++c) { for (int w = 0; w < x.dim(2); ++w) {
EXPECT_NEAR(*a, *b, abs_err) << "with index = [" for (int c = 0; c < x.dim(3); ++c) {
<< n << ", " << h << ", " EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
<< w << ", " << c << "]"; << n << ", " << h << ", "
a++; << w << ", " << c << "]";
b++; a++;
b++;
}
} }
} }
} }
} else {
for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i;
}
} }
} }
......
...@@ -21,7 +21,6 @@ def main(unused_args): ...@@ -21,7 +21,6 @@ def main(unused_args):
data = f.read() data = f.read()
input_graph_def.ParseFromString(data) input_graph_def.ParseFromString(data)
print 'done'
if FLAGS.runtime == 'dsp': if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize) input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
...@@ -38,6 +37,7 @@ def main(unused_args): ...@@ -38,6 +37,7 @@ def main(unused_args):
with gfile.GFile(FLAGS.output + '_txt', "wb") as f: with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors') # output_graph_def.ClearField('tensors')
f.write(str(output_graph_def)) f.write(str(output_graph_def))
print("Model conversion is completed.")
def parse_args(): def parse_args():
...@@ -82,7 +82,7 @@ def parse_args(): ...@@ -82,7 +82,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--output_type", "--output_type",
type=str, type=str,
default="source", default="pb",
help="output type: source/pb") help="output type: source/pb")
parser.add_argument( parser.add_argument(
"--template", "--template",
......
...@@ -44,6 +44,7 @@ class TFConverter(object): ...@@ -44,6 +44,7 @@ class TFConverter(object):
self.device = device self.device = device
self.tf_graph = {} self.tf_graph = {}
self.resolved_ops = {} self.resolved_ops = {}
self.unused_tensor = set()
for op in tf_ops: for op in tf_ops:
self.resolved_ops[op.name] = 0 self.resolved_ops[op.name] = 0
...@@ -72,6 +73,23 @@ class TFConverter(object): ...@@ -72,6 +73,23 @@ class TFConverter(object):
arg.i = self.dt arg.i = self.dt
return output_name return output_name
def add_image_to_buffer(self, input_name, input_type):
  # Insert an ImageToBuffer op converting `input_name` (an OpenCL image
  # tensor) back into a plain buffer, and return the new tensor name.
  #
  # NOTE(review): assumes `input_name` ends with a two-character port
  # suffix such as ":0" (single-digit port) — confirm against callers.
  output_name = input_name[:-2] + "_i2b" + input_name[-2:]
  op_def = self.net_def.op.add()
  op_def.name = output_name[:-2]
  op_def.type = 'ImageToBuffer'
  op_def.input.extend([input_name])
  op_def.output.extend([output_name])
  arg = op_def.arg.add()
  arg.name = 'buffer_type'
  arg.i = buffer_type_map[input_type]
  arg = op_def.arg.add()
  arg.name = 'T'
  arg.i = self.dt
  return output_name
def add_input_transform(self, name): def add_input_transform(self, name):
new_input_name = MACE_INPUT_NODE_NAME + ":0" new_input_name = MACE_INPUT_NODE_NAME + ":0"
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
...@@ -111,22 +129,23 @@ class TFConverter(object): ...@@ -111,22 +129,23 @@ class TFConverter(object):
op.output_shape.extend(output_shapes) op.output_shape.extend(output_shapes)
def convert_tensor(self, op): def convert_tensor(self, op):
tensor = self.net_def.tensors.add() if op.outputs[0].name not in self.unused_tensor:
tf_tensor = op.outputs[0].eval() tensor = self.net_def.tensors.add()
tensor.name = op.outputs[0].name tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape) shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32: tf_dt = op.get_attr('dtype')
tensor.data_type = mace_pb2.DT_FLOAT if tf_dt == tf.float32:
tensor.float_data.extend(tf_tensor.astype(np.float32).flat) tensor.data_type = mace_pb2.DT_FLOAT
elif tf_dt == tf.int32: tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
tensor.data_type = mace_pb2.DT_INT32 elif tf_dt == tf.int32:
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) tensor.data_type = mace_pb2.DT_INT32
else: tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
raise Exception("Not supported tensor type: " + tf_dt.name) else:
raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
def convert_conv2d(self, op): def convert_conv2d(self, op):
...@@ -253,6 +272,7 @@ class TFConverter(object): ...@@ -253,6 +272,7 @@ class TFConverter(object):
data_format_arg = op_def.arg.add() data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format' data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC' data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def]) self.net_def.op.extend([op_def])
for i in range(0, 7): for i in range(0, 7):
...@@ -326,6 +346,7 @@ class TFConverter(object): ...@@ -326,6 +346,7 @@ class TFConverter(object):
axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32) axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
self.add_output_shape(op.outputs, op_def) self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 2).name)
def convert_resize_bilinear(self, op): def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
...@@ -344,6 +365,7 @@ class TFConverter(object): ...@@ -344,6 +365,7 @@ class TFConverter(object):
size_arg.i = op.get_attr('align_corners') size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def) self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_bias_add(self, op): def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef() op_def = mace_pb2.OperatorDef()
...@@ -383,6 +405,79 @@ class TFConverter(object): ...@@ -383,6 +405,79 @@ class TFConverter(object):
size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat) size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat)
self.add_output_shape(op.outputs, op_def) self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
def is_atrous_conv2d(self, op):
  # TF lowers atrous convolution as SpaceToBatchND -> Conv2D
  # (-> BatchToSpaceND downstream). Detect the head of that pattern so
  # the whole chain can be folded into a single dilated Conv2D.
  return op.type == 'SpaceToBatchND' and\
      len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D'
def convert_atrous_conv2d(self, op):
  # Fold the TF pattern
  #   SpaceToBatchND -> Conv2D [-> BiasAdd] -> BatchToSpaceND [-> Relu]
  # into one MACE Conv2D (or FusedConv2D when a Relu follows) carrying a
  # 'dilations' argument. `op` is the SpaceToBatchND node; raises if the
  # chain has no matching BatchToSpaceND.
  op_def = mace_pb2.OperatorDef()
  arg = op_def.arg.add()
  arg.name = 'T'
  arg.i = self.dt
  # Sole consumer of the SpaceToBatchND (guaranteed by is_atrous_conv2d).
  conv_op = self.tf_graph[op.name][0]
  op_def.name = conv_op.name
  op_def.type = conv_op.type
  if self.device == 'gpu':
    op_def.input.extend([op.inputs[0].name])
    # GPU kernels read the filter from an OpenCL image; insert a
    # buffer->image transform for it.
    output_name = self.add_buffer_to_image(conv_op.inputs[1].name, "FILTER")
    op_def.input.extend([output_name])
  else:
    op_def.input.extend([op.inputs[0].name])
    op_def.input.extend([conv_op.inputs[1].name])

  # SpaceToBatchND input 1 is the block shape, which equals the dilation
  # rate of the equivalent atrous convolution.
  dilation_arg = op_def.arg.add()
  dilation_arg.name = 'dilations'
  dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
  padding_arg = op_def.arg.add()
  padding_arg.name = 'padding'
  # NOTE(review): non-zero paddings are mapped to SAME, all-zero to VALID,
  # by inspecting only the first padding entry — presumably sufficient for
  # tf.nn.atrous_conv2d output; confirm for hand-built graphs.
  padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
  if len(padding_values) > 0 and padding_values[0] > 0:
    padding_arg.i = padding_mode['SAME']
  else:
    padding_arg.i = padding_mode['VALID']
  # The block-shape and paddings const tensors are consumed here, so they
  # must not be emitted as weights.
  self.unused_tensor.add(get_input_tensor(op, 1).name)
  self.unused_tensor.add(get_input_tensor(op, 2).name)

  strides_arg = op_def.arg.add()
  strides_arg.name = 'strides'
  # The folded convolution always uses stride 1; striding is expressed
  # through the space-to-batch transform in the original graph.
  strides_arg.ints.extend([1, 1])
  data_format_arg = op_def.arg.add()
  data_format_arg.name = 'data_format'
  data_format_arg.s = 'NHWC'
  final_op = conv_op
  self.resolved_ops[op.name] = 1
  self.resolved_ops[conv_op.name] = 1

  # Absorb a trailing BiasAdd as the third input of the conv op.
  if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' :
    bias_add_op = self.tf_graph[final_op.name][0]
    if self.device == 'gpu':
      output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
      op_def.input.extend([output_name])
    else:
      op_def.input.extend([bias_add_op.inputs[1].name])
    final_op = bias_add_op
    self.resolved_ops[bias_add_op.name] = 1

  # The closing BatchToSpaceND is mandatory; it is dropped entirely since
  # the dilated conv replaces the whole space-to-batch sandwich.
  if len(self.tf_graph[final_op.name]) == 1 \
      and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
    final_op = self.tf_graph[final_op.name][0]
    self.resolved_ops[final_op.name] = 1
  else:
    raise Exception('Convert atrous conv error: no BatchToSpaceND op')

  # Fuse an immediately-following Relu into a FusedConv2D.
  if len(self.tf_graph[final_op.name]) == 1 \
      and self.tf_graph[final_op.name][0].type == 'Relu':
    relu_op = self.tf_graph[final_op.name][0]
    op_def.type = "FusedConv2D"
    final_op = relu_op
    self.resolved_ops[relu_op.name] = 1

  # The fused op inherits the outputs of the last folded TF node.
  op_def.output.extend([output.name for output in final_op.outputs])
  self.add_output_shape(final_op.outputs, op_def)
  self.net_def.op.extend([op_def])
def convert_normal_op(self, op): def convert_normal_op(self, op):
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
...@@ -407,7 +502,9 @@ class TFConverter(object): ...@@ -407,7 +502,9 @@ class TFConverter(object):
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
pass pass
elif op.type == 'Const': elif op.type == 'Const':
self.convert_tensor(op) pass
elif self.is_atrous_conv2d(op):
self.convert_atrous_conv2d(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
self.convert_conv2d(op) self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm': elif op.type == 'FusedBatchNorm':
...@@ -435,6 +532,15 @@ class TFConverter(object): ...@@ -435,6 +532,15 @@ class TFConverter(object):
else: else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
elif op.type == 'Const':
self.convert_tensor(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu': if self.device == 'gpu':
self.add_output_transform(output_node) self.add_output_transform(output_node)
......
...@@ -10,8 +10,8 @@ from tensorflow import gfile ...@@ -10,8 +10,8 @@ from tensorflow import gfile
# Validation Flow: # Validation Flow:
# 1. Generate input data # 1. Generate input data
# python validate_icnet.py --generate_data 1 \ # python validate_icnet.py --generate_data 1
# --random_seed 1 #
# 2. Use mace_run to run icnet on phone. # 2. Use mace_run to run icnet on phone.
# 3. adb pull the result. # 3. adb pull the result.
# 4. Compare output data of mace and tf # 4. Compare output data of mace and tf
...@@ -20,7 +20,7 @@ from tensorflow import gfile ...@@ -20,7 +20,7 @@ from tensorflow import gfile
# --mace_out_file icnet.out # --mace_out_file icnet.out
def generate_data(shape): def generate_data(shape):
np.random.seed(FLAGS.random_seed) np.random.seed()
data = np.random.random(shape) * -1 data = np.random.random(shape) * -1
print FLAGS.input_file print FLAGS.input_file
data.astype(np.float32).tofile(FLAGS.input_file) data.astype(np.float32).tofile(FLAGS.input_file)
...@@ -122,12 +122,7 @@ def parse_args(): ...@@ -122,12 +122,7 @@ def parse_args():
"--generate_data", "--generate_data",
type='bool', type='bool',
default="false", default="false",
help="Random seed for generate test case.") help="Generate data or not.")
parser.add_argument(
"--random_seed",
type=int,
default="0",
help="Random seed for generate test case.")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -79,7 +79,8 @@ build_and_run() ...@@ -79,7 +79,8 @@ build_and_run()
} }
echo "Step 1: Generate input data" echo "Step 1: Generate input data"
python tools/validate.py --generate_data true --random_seed 1 \ rm -rf ${MODEL_DIR}/${INPUT_FILE_NAME}
python tools/validate.py --generate_data true \
--input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \ --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
--input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3" --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册