提交 336f35b4 编写于 作者: L liuqi

Finish atrous convolution: faster than space_to_batch method.

上级 41a905c9
......@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "]";
return cl_image;
}
......
......@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*program = cl::Program(this->context(), {device()}, {binary});
#else
std::string source_filename = kernel_path_ + program_file_name;
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
// Create program
bool is_binary_filename_exist = std::ifstream(binary_filename).is_open();
if (is_binary_filename_exist) {
VLOG(1) << "Create program with binary: " << binary_filename;
std::vector<unsigned char> binary;
MACE_CHECK(ReadFile(binary_filename, true, &binary));
*program = cl::Program(this->context(), {device()}, {binary});
} else if (std::ifstream(source_filename).is_open()) {
if (std::ifstream(source_filename).is_open()) {
VLOG(1) << "Create program with source: " << source_filename;
std::vector<unsigned char> kernel_source;
MACE_CHECK(ReadFile(source_filename, false, &kernel_source));
......@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*program = cl::Program(this->context(), sources);
} else {
LOG(FATAL) << "Failed to open kernel file " << binary_filename << " or "
<< source_filename;
LOG(FATAL) << "Failed to open kernel file " << source_filename;
}
#endif
......@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
#ifndef MACE_EMBED_BINARY_PROGRAM
// Write binary if necessary
if (!is_binary_filename_exist) {
size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]);
cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * device_list_size,
program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]);
}
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]);
cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * device_list_size,
program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]);
}
err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
sizeof(unsigned char *) * device_list_size,
program_binaries.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::vector<unsigned char> content(
reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
program_binary_sizes[0]);
err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
sizeof(unsigned char *) * device_list_size,
program_binaries.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::vector<unsigned char> content(
reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
program_binary_sizes[0]);
MACE_CHECK(WriteFile(binary_filename, true, content));
}
MACE_CHECK(WriteFile(binary_filename, true, content));
#endif
}
......
......@@ -212,12 +212,14 @@ int main(int argc, char **argv) {
LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
}
MACE_CHECK(engine.Run(input_data.get(), input_shape_vec, output_data.get()));
if (output_data != nullptr) {
ofstream out_file(output_file, ios::binary);
out_file.write((const char *) (output_data.get()),
output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file done.";
} else {
LOG(ERROR) << "output data is null";
}
}
......@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private const int filter_height,
__private const int filter_width,
__private const int padding_top,
__private const int padding_left) {
__private const int padding_left,
__private const int dilation_h,
__private const int dilation_w) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
......@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
......@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = in_width##i + mul24(width_idx, dilation_w); \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
......
......@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private const int out_height,
__private const int out_width,
__private const int padding_top,
__private const int padding_left) {
__private const int padding_left,
__private const int dilation_h,
__private const int dilation_w) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
......@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int hb_idx, width_idx, in_width_idx;
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
......@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = in_width##i + mul24(width_idx, dilation_w); \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
......
......@@ -10,29 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output, StatsFuture *future);
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output, StatsFuture *future);
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output, StatsFuture *future);
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output, StatsFuture *future);
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output,
StatsFuture *future);
const int *dilations, const DataType dt,
Tensor *output, StatsFuture *future);
template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
......@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output,
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
......@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
<< ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
......@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, false, paddings.data(),
conv2d_func(input, filter, bias, false, paddings.data(), dilations_,
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, false, strides_[0],
paddings.data(), DataTypeToEnum<T>::value,
paddings.data(), dilations_, DataTypeToEnum<T>::value,
output, future);
}
......
......@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const int *dilations,
const DataType dt,
Tensor *output,
StatsFuture *future) {
......@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const int *dilations,
const DataType dt,
Tensor *output,
StatsFuture *future) {
......
......@@ -15,8 +15,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output,
StatsFuture *future) {
const int *dilations, const DataType dt,
Tensor *output, StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
conv_2d_kernel.setArg(idx++, dilations[0]);
conv_2d_kernel.setArg(idx++, dilations[1]);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
......@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const int *dilations,
const DataType dt,
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dilations, dt, output, future);
};
void Conv2dOpenclK3x3S2(const Tensor *input,
......@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const int *dilations,
const DataType dt,
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dilations, dt, output, future);
};
} // namespace kernels
......
......@@ -15,8 +15,8 @@ namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output,
StatsFuture *future) {
const int *dilations, const DataType dt,
Tensor *output, StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
conv_2d_kernel.setArg(idx++, dilations[0]);
conv_2d_kernel.setArg(idx++, dilations[1]);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
......
......@@ -10,33 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output,
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output,
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output,
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output,
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output,
StatsFuture *future);
const int *dilations, const DataType dt,
Tensor *output, StatsFuture *future);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
......@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output, StatsFuture *future);
const int *padding, const int *dilations,
const DataType dt, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{nullptr, nullptr}};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
<< ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
......@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(),
conv2d_func(input, filter, bias, true, paddings.data(), dilations_,
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
DataTypeToEnum<T>::value, output, future);
dilations_, DataTypeToEnum<T>::value, output, future);
}
}
......
......@@ -2,6 +2,7 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
......@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape) {
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
srand(time(NULL));
auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input
index_t batch = 3 + (rand() % 10);
index_t batch = 3;
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t input_channels = filter_shape[2] + (rand() % 10);
index_t output_channels = filter_shape[3] + (rand() % 10);
index_t input_channels = filter_shape[2];
index_t output_channels = filter_shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
......@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
......@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
func(1, 1, VALID);
func(1, 1, SAME);
if (dilations[0] == 1) {
func(2, 2, VALID);
func(2, 2, SAME);
}
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 1, 32, 64});
{1, 1, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{3, 3, 32, 64});
{3, 3, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{15, 1, 256, 2});
{15, 1, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 15, 256, 2});
{1, 15, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 64});
{7, 7, 3, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{1, 1, 5, 7});
{1, 1, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{3, 3, 5, 7});
{3, 3, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{5, 5, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{7, 7, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
{7, 7, 16, 16},
{4, 4});
}
template<DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
2);
}
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
4);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
4);
}
......@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
{15, 1, 32, 64});
}
template<DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
}
TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
}
TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
}
template<DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t input_channels = filter_shape[2];
index_t output_channels = filter_shape[3];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
};
func(1, 1, VALID);
func(1, 1, SAME);
}
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 16},
{2, 2});
}
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71},
{15, 15, 16, 16},
{2, 2});
}
......@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int n = 0; n < x.dim(0); ++n) {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
<< n << ", " << h << ", "
<< w << ", " << c << "]";
a++;
b++;
if (x.dim_size() == 4) {
for (int n = 0; n < x.dim(0); ++n) {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
<< n << ", " << h << ", "
<< w << ", " << c << "]";
a++;
b++;
}
}
}
}
} else {
for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i;
}
}
}
......
......@@ -21,7 +21,6 @@ def main(unused_args):
data = f.read()
input_graph_def.ParseFromString(data)
print 'done'
if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
......@@ -38,6 +37,7 @@ def main(unused_args):
with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
def parse_args():
......@@ -82,7 +82,7 @@ def parse_args():
parser.add_argument(
"--output_type",
type=str,
default="source",
default="pb",
help="output type: source/pb")
parser.add_argument(
"--template",
......
......@@ -44,6 +44,7 @@ class TFConverter(object):
self.device = device
self.tf_graph = {}
self.resolved_ops = {}
self.unused_tensor = set()
for op in tf_ops:
self.resolved_ops[op.name] = 0
......@@ -72,6 +73,23 @@ class TFConverter(object):
arg.i = self.dt
return output_name
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_input_transform(self, name):
new_input_name = MACE_INPUT_NODE_NAME + ":0"
op_def = self.net_def.op.add()
......@@ -111,22 +129,23 @@ class TFConverter(object):
op.output_shape.extend(output_shapes)
def convert_tensor(self, op):
tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
if op.outputs[0].name not in self.unused_tensor:
tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1
def convert_conv2d(self, op):
......@@ -253,6 +272,7 @@ class TFConverter(object):
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def])
for i in range(0, 7):
......@@ -326,6 +346,7 @@ class TFConverter(object):
axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 2).name)
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
......@@ -344,6 +365,7 @@ class TFConverter(object):
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
......@@ -383,6 +405,79 @@ class TFConverter(object):
size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
def is_atrous_conv2d(self, op):
return op.type == 'SpaceToBatchND' and\
len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D'
def convert_atrous_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
op_def.type = conv_op.type
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(conv_op.inputs[1].name, "FILTER")
op_def.input.extend([output_name])
else:
op_def.input.extend([op.inputs[0].name])
op_def.input.extend([conv_op.inputs[1].name])
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
if len(padding_values) > 0 and padding_values[0] > 0:
padding_arg.i = padding_mode['SAME']
else:
padding_arg.i = padding_mode['VALID']
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' :
bias_add_op = self.tf_graph[final_op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_add_op.inputs[1].name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 \
and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
final_op = self.tf_graph[final_op.name][0]
self.resolved_ops[final_op.name] = 1
else:
raise Exception('Convert atrous conv error: no BatchToSpaceND op')
if len(self.tf_graph[final_op.name]) == 1 \
and self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
final_op = relu_op
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
......@@ -407,7 +502,9 @@ class TFConverter(object):
self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const':
self.convert_tensor(op)
pass
elif self.is_atrous_conv2d(op):
self.convert_atrous_conv2d(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm':
......@@ -435,6 +532,15 @@ class TFConverter(object):
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
elif op.type == 'Const':
self.convert_tensor(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu':
self.add_output_transform(output_node)
......
......@@ -10,8 +10,8 @@ from tensorflow import gfile
# Validation Flow:
# 1. Generate input data
# python validate_icnet.py --generate_data 1 \
# --random_seed 1
# python validate_icnet.py --generate_data 1
#
# 2. Use mace_run to run icnet on phone.
# 3. adb pull the result.
# 4. Compare output data of mace and tf
......@@ -20,7 +20,7 @@ from tensorflow import gfile
# --mace_out_file icnet.out
def generate_data(shape):
np.random.seed(FLAGS.random_seed)
np.random.seed()
data = np.random.random(shape) * -1
print FLAGS.input_file
data.astype(np.float32).tofile(FLAGS.input_file)
......@@ -122,12 +122,7 @@ def parse_args():
"--generate_data",
type='bool',
default="false",
help="Random seed for generate test case.")
parser.add_argument(
"--random_seed",
type=int,
default="0",
help="Random seed for generate test case.")
help="Generate data or not.")
return parser.parse_known_args()
......
......@@ -79,7 +79,8 @@ build_and_run()
}
echo "Step 1: Generate input data"
python tools/validate.py --generate_data true --random_seed 1 \
rm -rf ${MODEL_DIR}/${INPUT_FILE_NAME}
python tools/validate.py --generate_data true \
--input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
--input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册