提交 cbff4266 编写于 作者: L Liangliang He

Merge branch 'winograd' into 'master'

Winograd convolution.

See merge request !223
...@@ -77,6 +77,9 @@ extern void Register_Pooling(OperatorRegistry *op_registry); ...@@ -77,6 +77,9 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
extern void Register_ResizeBilinear(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
extern void Register_Softmax(OperatorRegistry *op_registry); extern void Register_Softmax(OperatorRegistry *op_registry);
extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); extern void Register_SpaceToBatchND(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry);
extern void Register_WinogradTransform(OperatorRegistry *op_registry);
extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry);
OperatorRegistry::OperatorRegistry() { OperatorRegistry::OperatorRegistry() {
Register_Activation(this); Register_Activation(this);
...@@ -97,6 +100,9 @@ OperatorRegistry::OperatorRegistry() { ...@@ -97,6 +100,9 @@ OperatorRegistry::OperatorRegistry() {
Register_ResizeBilinear(this); Register_ResizeBilinear(this);
Register_Softmax(this); Register_Softmax(this);
Register_SpaceToBatchND(this); Register_SpaceToBatchND(this);
Register_MatMul(this);
Register_WinogradTransform(this);
Register_WinogradInverseTransform(this);
} }
} // namespace mace } // namespace mace
...@@ -19,7 +19,7 @@ class Registry { ...@@ -19,7 +19,7 @@ class Registry {
void Register(const SrcType &key, Creator creator) { void Register(const SrcType &key, Creator creator) {
VLOG(2) << "Registering: " << key; VLOG(2) << "Registering: " << key;
std::lock_guard<std::mutex> lock(register_mutex_); std::lock_guard<std::mutex> lock(register_mutex_);
MACE_CHECK(registry_.count(key) == 0, "Key already registered."); MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key);
registry_[key] = creator; registry_[key] = creator;
} }
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_MATMUL_H_
#define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
// Naive CPU batched matrix multiply: C[b] = A[b] * B[b].
// A: (batch, height, K, 1), B: (batch, K, width, 1) -> C: (batch, height, width, 1).
// C is resized here; A/B/C are mapped for the duration of the computation.
template <DeviceType D, typename T>
struct MatMulFunctor {
  void operator()(const Tensor *A,
                  const Tensor *B,
                  Tensor *C,
                  StatsFuture *future) {
    std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
    C->Resize(c_shape);

    const index_t N = C->dim(0);       // batch count
    const index_t height = C->dim(1);  // rows of A and C
    const index_t width = C->dim(2);   // columns of B and C
    const index_t K = A->dim(2);       // shared inner dimension

    Tensor::MappingGuard guarda(A);
    Tensor::MappingGuard guardb(B);
    Tensor::MappingGuard guardc(C);
    const T *a_ptr_base = A->data<T>();
    const T *b_ptr_base = B->data<T>();
    T *c_ptr = C->mutable_data<T>();

    // Use index_t loop counters (not int) so large tensors cannot overflow
    // the induction variables.
    for (index_t i = 0; i < N; ++i) {
      for (index_t h = 0; h < height; ++h) {
        for (index_t w = 0; w < width; ++w) {
          const T *a_ptr = a_ptr_base + h * K;
          const T *b_ptr = b_ptr_base + w;  // column w, stride `width` per row
          // Accumulate in a local so the inner loop does not repeatedly
          // store through c_ptr.
          T sum = 0;
          for (index_t k = 0; k < K; ++k) {
            sum += *a_ptr * *b_ptr;
            a_ptr++;
            b_ptr += width;
          }
          *c_ptr++ = sum;
        }
      }
      // Advance to the next batch's matrices.
      a_ptr_base += height * K;
      b_ptr_base += K * width;
    }
  }
};
// OpenCL specialization: declaration only here; the implementation lives in
// the corresponding opencl/matmul OpenCL translation unit.
template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *A,
                  const Tensor *B,
                  Tensor *C,
                  StatsFuture *future);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_MATMUL_H_
...@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
......
...@@ -17,7 +17,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors, ...@@ -17,7 +17,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
if (input_tensors.size() > 4) { if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
output->ResizeLike(input_tensors[0]);
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -49,7 +48,7 @@ static void AddN(const std::vector<const Tensor *> &input_tensors, ...@@ -49,7 +48,7 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
static_cast<uint32_t>(width_pixels), static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels) static_cast<uint32_t>(batch_height_pixels)
}; };
std::vector<uint32_t> lws = {64, 16, 1}; const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "addn_opencl_kernel_" ss << "addn_opencl_kernel_"
<< output->dim(0) << "_" << output->dim(0) << "_"
...@@ -82,7 +81,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -82,7 +81,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape = input_tensors[0]->shape(); std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor, future); AddN<T>(input_tensors, output_tensor, future);
......
...@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
......
...@@ -18,13 +18,21 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -18,13 +18,21 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, image_shape);
image->ResizeImage(buffer->shape(), image_shape); if(type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape);
} else {
image->ResizeImage(buffer->shape(), image_shape);
}
buffer->MarkUnused(); buffer->MarkUnused();
} else { } else {
image_shape = image->image_shape(); image_shape = image->image_shape();
buffer->Resize(image->shape()); buffer->Resize(image->shape());
} }
size_t gws[2] = {image_shape[0],
image_shape[1]};
string kernel_name; string kernel_name;
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
...@@ -33,12 +41,23 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -33,12 +41,23 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
case DW_CONV2D_FILTER: case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break; break;
case IN_OUT: case IN_OUT_CHANNEL:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
break; break;
case ARGUMENT: case ARGUMENT:
kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image"; kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image";
break; break;
case IN_OUT_HEIGHT:
kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
kernel_name = "in_out_width_buffer_to_image";
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
break;
} }
string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -68,16 +87,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -68,16 +87,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} }
b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer()))); b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer())));
const size_t gws[3] = {image_shape[0],
image_shape[1],
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {16, 64, 1}; const std::vector<uint32_t> lws = {16, 64};
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1]),
nullptr, &event); nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
......
...@@ -233,3 +233,212 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ ...@@ -233,3 +233,212 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
vstore4(values, 0, output + offset); vstore4(values, 0, output + offset);
} }
} }
// Packs an NHWC buffer into a 2D image of shape [W*C, N*ceil(H/4)]:
// each image pixel holds 4 consecutive values along the H axis
// (positions past the H extent are left zero).
__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
                                            __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
  const int wc = width * channels;
  // Number of 4-element blocks along H; the image has N * height_blks rows.
  const int height_blks = (height + 3) / 4;
  const int batch_idx = h / height_blks;
  const int height_idx = (h % height_blks) << 2;
  const int width_idx = w % width;
  const int channel_idx = w / width;
  // Buffer offset of the first of the (up to) 4 H-values for this pixel.
  int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;
  // size == 0 encodes "a full block of 4 values remains".
  int size = height - height_idx;
  size = size >= 4 ? 0 : size;
  DATA_TYPE4 values = 0;
  // Intentional fall-through: load only as many lanes as remain in H.
  switch(size) {
    case 0:
      values.w = *(input + offset + wc * 3);
    case 3:
      values.z = *(input + offset + wc * 2);
    case 2:
      values.y = *(input + offset + wc);
    case 1:
      values.x = *(input + offset);
  }
  int2 coord = (int2)(w, h);
  WRITE_IMAGET(output, coord, values);
}
// Unpacks the [W*C, N*ceil(H/4)] image layout back into an NHWC buffer,
// writing only the H positions that actually exist.
__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
                                            __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
  const int height_blks = (height + 3) / 4;
  const int batch_idx = h / height_blks;
  const int height_idx = (h % height_blks) << 2;
  const int width_idx = w % width;
  const int channel_idx = w / width;
  // Buffer offset of the first of the 4 packed H-values for this pixel.
  int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
  output[offset] = values.x;
  // Early-out before each subsequent lane if it falls past the H extent.
  if (height_idx + 1 >= height) return;
  offset += width * channels;
  output[offset] = values.y;
  if (height_idx + 2 >= height) return;
  offset += width * channels;
  output[offset] = values.z;
  if (height_idx + 3 >= height) return;
  offset += width * channels;
  output[offset] = values.w;
}
// Packs an NHWC buffer into a 2D image of shape [ceil(W/4)*C, N*H]:
// each image pixel holds 4 consecutive values along the W axis
// (positions past the W extent are left zero).
__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                           __private const int height,
                                           __private const int width,
                                           __private const int channels,
                                           __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
  const int batch_idx = h / height;
  const int height_idx = h % height;
  const int width_idx = (w % width) << 2;
  const int channel_idx = w / width;
  // Buffer offset of the first of the (up to) 4 W-values for this pixel;
  // consecutive W positions are `channels` apart in NHWC.
  const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;
  // size == 0 encodes "a full block of 4 values remains".
  int size = width - width_idx;
  size = size >= 4 ? 0 : size;
  DATA_TYPE4 values = 0;
  // Intentional fall-through: load only as many lanes as remain in W.
  switch(size) {
    case 0:
      values.w = *(input + offset + channels * 3);
    case 3:
      values.z = *(input + offset + channels * 2);
    case 2:
      values.y = *(input + offset + channels);
    case 1:
      values.x = *(input + offset);
  }
  int2 coord = (int2)(w, h);
  WRITE_IMAGET(output, coord, values);
}
// only support 3x3 now
// Winograd F(2x2, 3x3) filter transform: computes G * g * G^T for each
// (out-channel, 4-in-channel-block) pair. Each lane of a DATA_TYPE4 holds
// one of the 4 consecutive input channels; the 16 transformed elements are
// scattered across 16 row-bands of `out_channels` rows each.
// NOTE(review): assumes the launch uses get_global_size(1) == out_channels
// — confirm against the host-side gws setup.
__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int in_channels,
                                              __private const int height,
                                              __private const int width,
                                              __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
  const int out_channels = get_global_size(1);
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
  // Start of this output channel's filters for the 4-channel block.
  const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width;
  // Up to 4 channels x 9 weights; clamp for a partial last block.
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt;
  DATA_TYPE4 tu0[4], tu1[4], tu2[4], tu3[4];
#pragma unroll
  for (short i = 0; i < length; ++i) {
    in[i] = *(input + offset + i);
  }
  // Column-wise transform (rows 1 and 2 of G, i.e. (g0 +/- g1 + g2) / 2).
  // Vectors like (in[0], in[9], in[18], in[27]) gather one filter element
  // across the 4 input channels.
  tt = ((DATA_TYPE4)(in[0], in[9], in[18], in[27]) +
      (DATA_TYPE4)(in[6], in[15], in[24], in[33])) / 2;
  tu1[0] = tt + ((DATA_TYPE4)(in[3], in[12], in[21], in[30]) / 2);
  tu2[0] = tt - ((DATA_TYPE4)(in[3], in[12], in[21], in[30]) / 2);
  tt = ((DATA_TYPE4)(in[1], in[10], in[19], in[28]) +
      (DATA_TYPE4)(in[7], in[16], in[25], in[34])) / 2;
  tu1[1] = tt + ((DATA_TYPE4)(in[4], in[13], in[22], in[31]) / 2);
  tu2[1] = tt - ((DATA_TYPE4)(in[4], in[13], in[22], in[31]) / 2);
  tt = ((DATA_TYPE4)(in[2], in[11], in[20], in[29]) +
      (DATA_TYPE4)(in[8], in[17], in[26], in[35])) / 2;
  tu1[2] = tt + ((DATA_TYPE4)(in[5], in[14], in[23], in[32]) / 2);
  tu2[2] = tt - ((DATA_TYPE4)(in[5], in[14], in[23], in[32]) / 2);
  // Rows 0 and 3 of G copy the filter's first and last rows unchanged.
  tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
  tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
  tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
  tu3[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
  tu3[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
  tu3[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
  // Row-wise transform (multiply by G^T): expand each 3-wide row to 4.
  tt = (tu0[0] + tu0[2]) / 2;
  tu0[3] = tu0[2];
  tu0[2] = tt - tu0[1] / 2;
  tu0[1] = tt + tu0[1] / 2;
  tt = (tu1[0] + tu1[2]) / 2;
  tu1[3] = tu1[2];
  tu1[2] = tt - tu1[1] / 2;
  tu1[1] = tt + tu1[1] / 2;
  tt = (tu2[0] + tu2[2]) / 2;
  tu2[3] = tu2[2];
  tu2[2] = tt - tu2[1] / 2;
  tu2[1] = tt + tu2[1] / 2;
  tt = (tu3[0] + tu3[2]) / 2;
  tu3[3] = tu3[2];
  tu3[2] = tt - tu3[1] / 2;
  tu3[1] = tt + tu3[1] / 2;
  // Write the 16 transformed elements, one per row-band of out_channels rows.
  int2 coord = (int2)(w, h);
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, coord, tu0[i]);
    coord.y += out_channels;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, coord, tu1[i]);
    coord.y += out_channels;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, coord, tu2[i]);
    coord.y += out_channels;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, coord, tu3[i]);
    coord.y += out_channels;
  }
}
// only support 3x3 now
// Copies a Winograd-transformed filter image back into a linear buffer.
// Each work-item walks the 16 row-bands (spaced `height` rows apart) for
// its column, writing up to 4 values per band with a tail guard when
// `width` is not a multiple of 4.
__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
                                              __private const int width,
                                              __private const int channel,
                                              __read_only image2d_t input) {
  const int w = get_global_id(0);
  const int h = get_global_id(1);
  const int width_idx = w << 2;
  const int size = width - width_idx;
  int offset = h * width + width_idx;
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values;
  for (short i = 0; i < 16; ++i) {
    values = READ_IMAGET(input, SAMPLER, coord);
    if (size < 4) {
      // Intentional fall-through: store only the lanes that fit.
      switch (size) {
        case 3:
          output[offset+2] = values.z;
        case 2:
          output[offset+1] = values.y;
        case 1:
          output[offset] = values.x;
      }
    } else {
      vstore4(values, 0, output + offset);
    }
    // Advance one row-band in the image and one H*W plane in the buffer.
    coord.y += height;
    offset += height * width;
  }
}
#include <common.h>
// C = A * B
// Batched tiled matmul: each work-item computes a 4x4 tile of C —
// 4 rows (one image row packs 4 M-values) by 4 columns (gx .. gx+3).
// An A pixel (pos, row) holds 4 consecutive K-values of one row; a B pixel
// (col, kb) holds 4 consecutive K-values of one column, so each dot()
// reduces 4 elements of K at a time.
__kernel void matmul(__read_only image2d_t A,
                     __read_only image2d_t B,
                     __write_only image2d_t C,
                     __private const int M,
                     __private const int N,
                     __private const int K,
                     __private const int height_blocks,
                     __private const int k_blocks) {
  const int gx = get_global_id(0) << 2;
  const int hb = get_global_id(1);
  const int batch = hb / height_blocks;
  const int ty = (hb % height_blocks);
  const int gy = mad24(batch, height_blocks, ty);
  // Row base inside A's image: batch offset plus 4 rows per height block.
  const int bm = mad24(batch, M, ty << 2);
  const int bk = mul24(batch, k_blocks);
  float4 a0, a1, a2, a3;
  float4 b0, b1, b2, b3;
  // NOTE(review): accumulators are float4 while the products below are cast
  // to DATA_TYPE4 — for half DATA_TYPE this mixes precisions; confirm intended.
  float4 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
  for (short pos = 0; pos < k_blocks; pos += 1) {
    a0 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm)));
    a1 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 1)));
    a2 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 2)));
    a3 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 3)));
    b0 = READ_IMAGET(B, SAMPLER, (int2)(gx, (bk + pos)));
    b1 = READ_IMAGET(B, SAMPLER, (int2)(gx + 1, (bk + pos)));
    b2 = READ_IMAGET(B, SAMPLER, (int2)(gx + 2, (bk + pos)));
    b3 = READ_IMAGET(B, SAMPLER, (int2)(gx + 3, (bk + pos)));
    c0 += (DATA_TYPE4)(dot(a0, b0), dot(a1, b0), dot(a2, b0), dot(a3, b0));
    c1 += (DATA_TYPE4)(dot(a0, b1), dot(a1, b1), dot(a2, b1), dot(a3, b1));
    c2 += (DATA_TYPE4)(dot(a0, b2), dot(a1, b2), dot(a2, b2), dot(a3, b2));
    c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3));
  }
  // Guard the tail columns when N is not a multiple of 4.
  WRITE_IMAGET(C, (int2)(gx, gy), c0);
  if ((gx + 1) >= N) return;
  WRITE_IMAGET(C, (int2)(gx + 1, gy), c1);
  if ((gx + 2) >= N) return;
  WRITE_IMAGET(C, (int2)(gx + 2, gy), c2);
  if ((gx + 3) >= N) return;
  WRITE_IMAGET(C, (int2)(gx + 3, gy), c3);
}
#include <common.h>
// Winograd F(2x2, 3x3) input transform: for each 4x4 input patch d
// (stride 2, with padding) compute B^T * d * B and scatter the 16
// transformed elements across 16 row-bands of the output image.
// Each work-item handles one tile position (gid 0) for one block of 4
// channels (gid 1). Out-of-bounds taps use coordinate -1; this yields
// zeros only if SAMPLER uses CLK_ADDRESS_CLAMP with a zero border —
// confirm against the sampler definition in common.h.
__kernel void winograd_transform_2x2(__read_only image2d_t input,
                                     __write_only image2d_t output,
                                     __private const int in_height,
                                     __private const int in_width,
                                     __private const int in_channel,
                                     __private const int round_hw,
                                     __private const int round_w,
                                     __private const int padding_top,
                                     __private const int padding_left) {
  int out_width_idx = get_global_id(0);
  int chan_blk_idx = get_global_id(1);
  const int chan_blk_size = get_global_size(1);
  const int batch_idx = out_width_idx / round_hw;
  const int t_idx = out_width_idx % round_hw;
  // Top-left corner of this 4x4 patch in the (padded) input.
  const int height_idx = ((t_idx / round_w) << 1) - padding_top;
  const int width_idx = ((t_idx % round_w) << 1) - padding_left;
  const int nh_idx = mad24(batch_idx, in_height, height_idx);
  const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
  DATA_TYPE4 input0[4];
  DATA_TYPE4 input1[4];
  DATA_TYPE4 input2[4];
  DATA_TYPE4 input3[4];
  DATA_TYPE4 tv0[4];
  DATA_TYPE4 tv1[4];
  DATA_TYPE4 tv2[4];
  DATA_TYPE4 tv3[4];
  // Load the 4 rows of the patch; -1 marks out-of-range taps.
  int y = select(nh_idx, -1, height_idx < 0 || height_idx >= in_height);
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    int x = width_idx + i;
    x = select(wc_idx + i, -1, x < 0 || x >= in_width);
    input0[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
  }
  y = select(nh_idx + 1, -1, height_idx + 1 < 0 || height_idx + 1 >= in_height);
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    int x = width_idx + i;
    x = select(wc_idx + i, -1, x < 0 || x >= in_width);
    input1[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
  }
  y = select(nh_idx + 2, -1, height_idx + 2 < 0 || height_idx + 2 >= in_height);
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    int x = width_idx + i;
    x = select(wc_idx + i, -1, x < 0 || x >= in_width);
    input2[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
  }
  y = select(nh_idx + 3, -1, height_idx + 3 < 0 || height_idx + 3 >= in_height);
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    int x = width_idx + i;
    x = select(wc_idx + i, -1, x < 0 || x >= in_width);
    input3[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
  }
  // Row transform: tv = B^T * d  (rows: d0-d2, d1+d2, d2-d1, d1-d3).
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    tv0[i] = input0[i] - input2[i];
    tv1[i] = input1[i] + input2[i];
    tv2[i] = input2[i] - input1[i];
    tv3[i] = input1[i] - input3[i];
  }
  // Column transform: input = tv * B (same pattern applied per row).
  input0[0] = tv0[0] - tv0[2];
  input0[1] = tv0[1] + tv0[2];
  input0[2] = tv0[2] - tv0[1];
  input0[3] = tv0[1] - tv0[3];
  input1[0] = tv1[0] - tv1[2];
  input1[1] = tv1[1] + tv1[2];
  input1[2] = tv1[2] - tv1[1];
  input1[3] = tv1[1] - tv1[3];
  input2[0] = tv2[0] - tv2[2];
  input2[1] = tv2[1] + tv2[2];
  input2[2] = tv2[2] - tv2[1];
  input2[3] = tv2[1] - tv2[3];
  input3[0] = tv3[0] - tv3[2];
  input3[1] = tv3[1] + tv3[2];
  input3[2] = tv3[2] - tv3[1];
  input3[3] = tv3[1] - tv3[3];
  // Scatter the 16 transformed elements, one per row-band of
  // chan_blk_size rows each.
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]);
    chan_blk_idx += chan_blk_size;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input1[i]);
    chan_blk_idx += chan_blk_size;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input2[i]);
    chan_blk_idx += chan_blk_size;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input3[i]);
    chan_blk_idx += chan_blk_size;
  }
}
// Winograd F(2x2, 3x3) output transform: gathers the 16 transformed
// elements of one tile (spread across 16 row-bands of out_channel rows),
// computes A^T * m * A to recover the 2x2 output patch, adds the optional
// bias and fused activation, then writes with bounds checks on the
// right/bottom edges.
__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
#ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                                             __write_only image2d_t output,
                                             __private const int out_height,
                                             __private const int out_width,
                                             __private const int round_hw,
                                             __private const int round_w,
                                             __private const DATA_TYPE relux_max_limit,
                                             __private const DATA_TYPE prelu_alpha) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
  const int out_channel = get_global_size(1);
  // `width` stays equal to width_idx; only `height` advances across bands.
  int width = width_idx;
  int height = height_idx;
  const int batch = width_idx / round_hw;
  int t = width_idx % round_hw;
  // Top-left corner of the 2x2 output patch this tile produces.
  const int out_height_idx = (t / round_w) << 1;
  const int out_width_idx = (t % round_w) << 1;
  const int out_chan_idx = height_idx;
  const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
  const int coord_y = mad24(batch, out_height, out_height_idx);
#ifdef BIAS
  DATA_TYPE4 bias_value =
     READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
#endif
  // Gather the 4x4 transformed tile: in{r}[c] is element (r, c).
  DATA_TYPE4 in0[4], in1[4], in2[4], in3[4];
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width, height));
    height += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height));
    height += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height));
    height += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 4; ++i) {
    in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height));
    height += out_channel;
  }
  // A^T * m: column sums of rows 0..2 (top output row)...
  in0[0] = in0[0] + in1[0] + in2[0];
  in0[1] = in0[1] + in1[1] + in2[1];
  in0[2] = in0[2] + in1[2] + in2[2];
  in0[3] = in0[3] + in1[3] + in2[3];
  // ...then * A: in0[0]/in0[1] become the top-left/top-right outputs.
  in0[0] = in0[0] + in0[1] + in0[2];
  in0[1] = in0[1] - in0[2] - in0[3];
  // Bottom output row: rows 1 - 2 - 3, then the same column reduction.
  in1[0] = in1[0] - in2[0] - in3[0];
  in1[1] = in1[1] - in2[1] - in3[1];
  in1[2] = in1[2] - in2[2] - in3[2];
  in1[3] = in1[3] - in2[3] - in3[3];
  in1[0] = in1[0] + in1[1] + in1[2];
  in1[1] = in1[1] - in1[2] - in1[3];
#ifdef BIAS
  in0[0] += bias_value;
  in0[1] += bias_value;
  in1[0] += bias_value;
  in1[1] += bias_value;
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
  in0[0] = do_activation(in0[0], relux_max_limit, prelu_alpha);
  in0[1] = do_activation(in0[1], relux_max_limit, prelu_alpha);
  in1[0] = do_activation(in1[0], relux_max_limit, prelu_alpha);
  in1[1] = do_activation(in1[1], relux_max_limit, prelu_alpha);
#endif
  // Write the 2x2 patch, skipping pixels past the output extent;
  // t counts how many edge pixels were valid to gate the corner write.
  WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]);
  t = 0;
  if (out_width_idx + 1 < out_width) {
    WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y), in0[1]);
    t += 1;
  }
  if (out_height_idx + 1 < out_height) {
    WRITE_IMAGET(output, (int2)(coord_x, coord_y + 1), in1[0]);
    t += 1;
  }
  if (t == 2) {
    WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]);
  }
}
...@@ -50,7 +50,7 @@ static void Concat2(const Tensor *input0, ...@@ -50,7 +50,7 @@ static void Concat2(const Tensor *input0,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_opencl_kernel_" ss << "concat_opencl_kernel_"
<< output->dim(0) << "_" << output->dim(0) << "_"
...@@ -85,7 +85,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te ...@@ -85,7 +85,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
output_shape[axis_] += input->dim(axis_); output_shape[axis_] += input->dim(axis_);
} }
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output->ResizeImage(output_shape, image_shape); output->ResizeImage(output_shape, image_shape);
switch (inputs_count) { switch (inputs_count) {
......
...@@ -109,7 +109,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -109,7 +109,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
paddings_, output_shape.data(), paddings.data()); paddings_, output_shape.data(), paddings.data());
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
......
...@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input, ...@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 15, 8, 1}; const std::vector<uint32_t> lws = {8, 15, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input, ...@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {4, 15, 8, 1}; const std::vector<uint32_t> lws = {4, 15, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -97,7 +97,7 @@ void Conv2dOpencl(const Tensor *input, ...@@ -97,7 +97,7 @@ void Conv2dOpencl(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -106,7 +106,7 @@ void DepthwiseConv2d(const Tensor *input, // NHWC ...@@ -106,7 +106,7 @@ void DepthwiseConv2d(const Tensor *input, // NHWC
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier); batch, height, width, channels, multiplier);
TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future);
...@@ -150,7 +150,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -150,7 +150,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_, output_shape.data(), paddings.data()); padding_, output_shape.data(), paddings.data());
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_, DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_,
......
...@@ -45,6 +45,34 @@ void CalArgImageShape(const std::vector<index_t> &shape, ...@@ -45,6 +45,34 @@ void CalArgImageShape(const std::vector<index_t> &shape,
image_shape[1] = 1; image_shape[1] = 1;
} }
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]);
image_shape[1] = (shape[0] << 4);
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[2] * shape[3];
image_shape[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[2]) * shape[3];
image_shape[1] = shape[0] * shape[1];
}
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
...@@ -55,13 +83,39 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -55,13 +83,39 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
case DW_CONV2D_FILTER: case DW_CONV2D_FILTER:
CalDepthwiseConv2dFilterImageShape(shape, image_shape); CalDepthwiseConv2dFilterImageShape(shape, image_shape);
break; break;
case IN_OUT: case IN_OUT_CHANNEL:
CalInOutputImageShape(shape, image_shape); CalInOutputImageShape(shape, image_shape);
break; break;
case ARGUMENT: case ARGUMENT:
CalArgImageShape(shape, image_shape); CalArgImageShape(shape, image_shape);
break; break;
default:LOG(FATAL) << "Mace not supported yet."; case IN_OUT_HEIGHT:
CalInOutHeightImageShape(shape, image_shape);
break;
case IN_OUT_WIDTH:
CalInOutWidthImageShape(shape, image_shape);
break;
case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1};
}else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] *
((shape[1] - 1) / 2) *
((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
} }
} }
...@@ -104,7 +158,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -104,7 +158,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
void TuningOrRun3DKernel(cl::Kernel &kernel, void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
...@@ -201,7 +255,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -201,7 +255,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
......
...@@ -18,15 +18,21 @@ const float kMaxKernelExeTime = 1000.0; // microseconds ...@@ -18,15 +18,21 @@ const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType { enum BufferType {
CONV2D_FILTER = 0, CONV2D_FILTER = 0,
DW_CONV2D_FILTER = 1, IN_OUT_CHANNEL = 1,
IN_OUT = 2, ARGUMENT = 2,
ARGUMENT = 3 IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
}; };
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type);
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DtToUpstreamCLCMDDt(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt);
...@@ -38,14 +44,14 @@ std::string DtToUpstreamCLDt(const DataType dt); ...@@ -38,14 +44,14 @@ std::string DtToUpstreamCLDt(const DataType dt);
void TuningOrRun3DKernel(cl::Kernel &kernel, void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
inline void SetFuture(StatsFuture *future, const cl::Event &event) { inline void SetFuture(StatsFuture *future, const cl::Event &event) {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/matmul.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
// Batched matrix multiply C = A * B on OpenCL image buffers.
// Assumed layouts (from the dims read below -- confirm against matmul.cl):
//   A: [batch, M, K, 1], B: [batch, K, N, 1]  ->  C: [batch, M, N, 1],
// where K = A->dim(2). C is resized here; callers need not pre-shape it.
template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *A,
    const Tensor *B,
    Tensor *C,
    StatsFuture *future) {
  std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
  std::vector<size_t> c_image_shape;
  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
  C->ResizeImage(c_shape, c_image_shape);

  const index_t batch = C->dim(0);
  const index_t height = C->dim(1);
  const index_t width = C->dim(2);

  // Kernel computes 4x4 output blocks, hence the /4 rounding of both dims.
  const index_t height_blocks = RoundUpDiv4(height);
  const index_t width_blocks = RoundUpDiv4(width);

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;
  auto dt = DataTypeToEnum<T>::value;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
  built_options.emplace("-Dmatmul=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  auto matmul_kernel = runtime->BuildKernel("matmul", kernel_name, built_options);

  // Argument order must match the matmul.cl kernel signature.
  uint32_t idx = 0;
  matmul_kernel.setArg(idx++,
                       *(static_cast<const cl::Image2D *>(A->buffer())));
  matmul_kernel.setArg(idx++,
                       *(static_cast<const cl::Image2D *>(B->buffer())));
  matmul_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(C->buffer())));
  matmul_kernel.setArg(idx++, static_cast<int>(height));
  matmul_kernel.setArg(idx++, static_cast<int>(width));
  matmul_kernel.setArg(idx++, static_cast<int>(A->dim(2)));
  matmul_kernel.setArg(idx++, static_cast<int>(height_blocks));
  matmul_kernel.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));

  const uint32_t gws[2] = {
      static_cast<uint32_t>(width_blocks),
      static_cast<uint32_t>(height_blocks * batch),
  };
  const std::vector<uint32_t> lws = {16, 64, 1};
  std::stringstream ss;
  ss << "matmul_opencl_kernel_"
     << C->dim(0) << "_"
     << C->dim(1) << "_"
     << C->dim(2) << "_"
     << C->dim(3);
  TuningOrRun2DKernel(matmul_kernel, ss.str(), gws, lws, future);
}  // NOTE: dropped the stray ';' that followed the function body.
// Explicit instantiations for the data types supported by the OpenCL path.
template
struct MatMulFunctor<DeviceType::OPENCL, float>;
template
struct MatMulFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
...@@ -92,7 +92,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -92,7 +92,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
output_shape.data(), paddings.data()); output_shape.data(), paddings.data());
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
......
...@@ -28,7 +28,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -28,7 +28,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape {batch, out_height, out_width, channels}; std::vector<index_t> output_shape {batch, out_height, out_width, channels};
if (input->is_image()) { if (input->is_image()) {
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
} else { } else {
output->Resize(output_shape); output->Resize(output_shape);
...@@ -59,7 +59,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,7 +59,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_" << output->dim(0) << "_"
......
...@@ -41,7 +41,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -41,7 +41,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "softmax_opencl_kernel_" ss << "softmax_opencl_kernel_"
<< output->dim(0) << "_" << output->dim(0) << "_"
......
...@@ -21,7 +21,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -21,7 +21,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
Tensor *batch_tensor, Tensor *batch_tensor,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
const char *kernel_name = nullptr; const char *kernel_name = nullptr;
if (b2s_) { if (b2s_) {
space_tensor->ResizeImage(output_shape, output_image_shape); space_tensor->ResizeImage(output_shape, output_image_shape);
...@@ -61,7 +61,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -61,7 +61,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
const uint32_t gws[3] = {chan_blk, const uint32_t gws[3] = {chan_blk,
static_cast<uint32_t>(batch_tensor->dim(2)), static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << kernel_name << "_" ss << kernel_name << "_"
<< batch_tensor->dim(0) << "_" << batch_tensor->dim(0) << "_"
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/winograd_transform.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
// Transforms an NHWC input into the Winograd F(2x2, 3x3) tile domain.
// Output layout: [16, C, N * tiles_h * tiles_w, 1], stored as an
// IN_OUT_HEIGHT image. Padding is derived for an implicit 3x3 filter.
template<typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
                                                                 Tensor *output_tensor,
                                                                 StatsFuture *future) {
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
  std::vector<int> paddings(2);
  kernels::CalcNHWCPaddingAndOutputSize(
      input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
      strides_.data(), paddings_, output_shape.data(), paddings.data());

  // Each 2x2 output tile consumes one 4x4 input patch; round_h/round_w
  // count tiles along H and W of the conv output.
  const index_t round_h = (output_shape[1] + 1) / 2;
  const index_t round_w = (output_shape[2] + 1) / 2;
  const index_t out_width = input_tensor->dim(0) * round_h * round_w;

  output_shape = {16, input_tensor->dim(3), out_width, 1};
  std::vector<size_t> image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
  output_tensor->ResizeImage(output_shape, image_shape);

  // Fully qualify std::string instead of relying on a 'using' elsewhere.
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
  std::set<std::string> built_options;
  built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
  auto runtime = OpenCLRuntime::Global();
  auto wino_kernel = runtime->BuildKernel("winograd_transform",
                                          obfuscated_kernel_name,
                                          built_options);

  // Argument order must match winograd_transform_2x2 in winograd_transform.cl.
  uint32_t idx = 0;
  wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
  wino_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_w));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));

  // Cast to uint32_t, not size_t: size_t -> uint32_t inside a braced
  // initializer is a narrowing conversion on 64-bit builds.
  const uint32_t gws[2] = {static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
  const std::vector<uint32_t> lws = {128, 8, 1};
  std::stringstream ss;
  ss << "winograd_transform_kernel_"
     << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_"
     << input_tensor->dim(2) << "_"
     << input_tensor->dim(3);
  TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future);
}
// Reconstructs the NHWC conv output from the Winograd tile domain and fuses
// the configured activation (and optional bias) into the kernel.
template<typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
                                                                        const Tensor *bias,
                                                                        Tensor *output_tensor,
                                                                        StatsFuture *future) {
  std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)};
  std::vector<size_t> image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
  output_tensor->ResizeImage(output_shape, image_shape);

  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
  std::set<std::string> built_options;
  built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
  // Only add -DBIAS when a bias tensor is present (avoids inserting an
  // empty option string into the build-option set).
  if (bias != nullptr) {
    built_options.emplace("-DBIAS");
  }
  switch (activation_) {
    case NOOP:
      break;
    case RELU:
      built_options.emplace("-DUSE_RELU");
      break;
    case RELUX:
      built_options.emplace("-DUSE_RELUX");
      break;
    case PRELU:
      built_options.emplace("-DUSE_PRELU");
      break;
    case TANH:
      built_options.emplace("-DUSE_TANH");
      break;
    case SIGMOID:
      built_options.emplace("-DUSE_SIGMOID");
      break;
    default:  // FIX: was the typo'd label "defeult:", which compiled as an
              // unused goto-label and silently ignored unknown activations.
      LOG(FATAL) << "Unknown activation type: " << activation_;
  }
  auto runtime = OpenCLRuntime::Global();
  auto wino_kernel = runtime->BuildKernel("winograd_transform",
                                          obfuscated_kernel_name,
                                          built_options);

  // Tile counts along H and W of the reconstructed output.
  const uint32_t round_h = (height_ + 1) / 2;
  const uint32_t round_w = (width_ + 1) / 2;

  // Argument order must match winograd_inverse_transform_2x2 in the .cl file;
  // the bias image slot exists only when -DBIAS is defined.
  uint32_t idx = 0;
  wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
  if (bias != nullptr) {
    wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
  }
  wino_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_w));
  wino_kernel.setArg(idx++, relux_max_limit_);
  wino_kernel.setArg(idx++, prelu_alpha_);

  // Cast to uint32_t, not size_t: avoids a narrowing conversion in the
  // braced initializer on 64-bit builds.
  const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)),
                           static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
  const std::vector<uint32_t> lws = {128, 8, 1};
  std::stringstream ss;
  ss << "winograd_inverse_transform_kernel_"
     << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_"
     << input_tensor->dim(2) << "_"
     << input_tensor->dim(3);
  TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future);
}
// Explicit instantiations for the data types supported by the OpenCL path.
template
struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
namespace mace {
namespace kernels {
// Shared state for the Winograd input transform. Strides and dilations are
// fixed to 1 (the transform is defined for unit-stride 3x3 conv); only the
// padding scheme is configurable.
struct WinogradTransformFunctorBase {
  // 'explicit' prevents accidental implicit Padding -> functor conversions.
  explicit WinogradTransformFunctorBase(const Padding &paddings)
      : strides_({1, 1}), dilations_({1, 1}), paddings_(paddings) {}

  const std::vector<int> strides_;    // [stride_h, stride_w], always {1, 1}
  const std::vector<int> dilations_;  // [dilation_h, dilation_w], always {1, 1}
  Padding paddings_;                  // padding scheme (e.g. SAME/VALID)
};
// Generic fallback: the Winograd transform is only implemented for OpenCL.
template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
  explicit WinogradTransformFunctor(const Padding &paddings)
      : WinogradTransformFunctorBase(paddings) {}

  // No CPU/NEON implementation exists; MACE_NOT_IMPLEMENTED is presumably
  // fatal -- confirm against the macro definition.
  void operator()(const Tensor *input,
                  Tensor *output,
                  StatsFuture *future) {
    MACE_NOT_IMPLEMENTED;
  }
};
// OpenCL implementation; defined in kernels/opencl/winograd_transform_opencl.cc.
template<typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase {
  explicit WinogradTransformFunctor(const Padding &paddings)
      : WinogradTransformFunctorBase(paddings) {}

  void operator()(const Tensor *input,
                  Tensor *output,
                  StatsFuture *future);
};
// Shared configuration for the Winograd inverse transform: the geometry of
// the NHWC output to reconstruct, plus the activation fused into the kernel.
struct WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctorBase(const int batch,
                                      const int height,
                                      const int width,
                                      const ActivationType activation,
                                      const float relux_max_limit,
                                      const float prelu_alpha)
      : batch_(batch),
        height_(height),
        width_(width),
        activation_(activation),
        relux_max_limit_(relux_max_limit),
        prelu_alpha_(prelu_alpha) {}

  const int batch_;   // N of the reconstructed output
  const int height_;  // H of the reconstructed output
  const int width_;   // W of the reconstructed output
  const ActivationType activation_;
  // Passed through to the kernel; presumably the RELUX cap and the PRELU
  // slope respectively -- confirm against the activation kernel code.
  const float relux_max_limit_;
  const float prelu_alpha_;
};
// Generic fallback: the Winograd inverse transform is only implemented for
// OpenCL; other device types abort via MACE_NOT_IMPLEMENTED.
template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(const int batch,
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const float prelu_alpha)
      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}

  // No CPU/NEON implementation exists for this functor.
  void operator()(const Tensor *input,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future) {
    MACE_NOT_IMPLEMENTED;
  }
};
// OpenCL implementation; the definition lives in the corresponding
// kernels/opencl/ source file. 'bias' may be null (bias is optional).
template<typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(const int batch,
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const float prelu_alpha)
      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}

  void operator()(const Tensor *input,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_WINOGRAD_TRANSFORM_H_
...@@ -20,7 +20,7 @@ static void ReluBenchmark( ...@@ -20,7 +20,7 @@ static void ReluBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM") OpDefBuilder("Activation", "ReluBM")
.Input("InputImage") .Input("InputImage")
...@@ -79,7 +79,7 @@ static void ReluxBenchmark( ...@@ -79,7 +79,7 @@ static void ReluxBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM") OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage") .Input("InputImage")
...@@ -140,7 +140,7 @@ static void PreluBenchmark( ...@@ -140,7 +140,7 @@ static void PreluBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "PreluBM") OpDefBuilder("Activation", "PreluBM")
.Input("InputImage") .Input("InputImage")
...@@ -201,7 +201,7 @@ static void TanhBenchmark( ...@@ -201,7 +201,7 @@ static void TanhBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM") OpDefBuilder("Activation", "TanhBM")
.Input("InputImage") .Input("InputImage")
...@@ -260,7 +260,7 @@ static void SigmoidBenchmark( ...@@ -260,7 +260,7 @@ static void SigmoidBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM") OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage") .Input("InputImage")
......
...@@ -20,7 +20,7 @@ void TestSimpleRelu() { ...@@ -20,7 +20,7 @@ void TestSimpleRelu() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("InputImage") .Input("InputImage")
...@@ -33,7 +33,7 @@ void TestSimpleRelu() { ...@@ -33,7 +33,7 @@ void TestSimpleRelu() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("Input") .Input("Input")
...@@ -70,7 +70,7 @@ void TestUnalignedSimpleRelu() { ...@@ -70,7 +70,7 @@ void TestUnalignedSimpleRelu() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("InputImage") .Input("InputImage")
...@@ -83,7 +83,7 @@ void TestUnalignedSimpleRelu() { ...@@ -83,7 +83,7 @@ void TestUnalignedSimpleRelu() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("Input") .Input("Input")
...@@ -125,7 +125,7 @@ void TestSimpleRelux() { ...@@ -125,7 +125,7 @@ void TestSimpleRelux() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage") .Input("InputImage")
...@@ -139,7 +139,7 @@ void TestSimpleRelux() { ...@@ -139,7 +139,7 @@ void TestSimpleRelux() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("Input") .Input("Input")
...@@ -179,7 +179,7 @@ void TestSimpleReluRelux() { ...@@ -179,7 +179,7 @@ void TestSimpleReluRelux() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage") .Input("InputImage")
...@@ -193,7 +193,7 @@ void TestSimpleReluRelux() { ...@@ -193,7 +193,7 @@ void TestSimpleReluRelux() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("Input") .Input("Input")
...@@ -237,7 +237,7 @@ void TestSimplePrelu() { ...@@ -237,7 +237,7 @@ void TestSimplePrelu() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("InputImage") .Input("InputImage")
...@@ -251,7 +251,7 @@ void TestSimplePrelu() { ...@@ -251,7 +251,7 @@ void TestSimplePrelu() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("Input") .Input("Input")
...@@ -293,7 +293,7 @@ void TestSimpleTanh() { ...@@ -293,7 +293,7 @@ void TestSimpleTanh() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest") OpDefBuilder("Activation", "TanhTest")
.Input("InputImage") .Input("InputImage")
...@@ -306,7 +306,7 @@ void TestSimpleTanh() { ...@@ -306,7 +306,7 @@ void TestSimpleTanh() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "TanhTest") OpDefBuilder("Activation", "TanhTest")
.Input("Input") .Input("Input")
...@@ -348,7 +348,7 @@ void TestSimpleSigmoid() { ...@@ -348,7 +348,7 @@ void TestSimpleSigmoid() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest") OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage") .Input("InputImage")
...@@ -361,7 +361,7 @@ void TestSimpleSigmoid() { ...@@ -361,7 +361,7 @@ void TestSimpleSigmoid() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Activation", "SigmoidTest") OpDefBuilder("Activation", "SigmoidTest")
.Input("Input") .Input("Input")
......
...@@ -23,7 +23,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -23,7 +23,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(), BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(), internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} }
OpDefBuilder op_def_builder("AddN", "AddNBM"); OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
......
...@@ -104,7 +104,7 @@ void RandomTest() { ...@@ -104,7 +104,7 @@ void RandomTest() {
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i), BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i), "InputImage" + ToString(i),
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
...@@ -119,7 +119,7 @@ void RandomTest() { ...@@ -119,7 +119,7 @@ void RandomTest() {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
} }
......
...@@ -24,7 +24,7 @@ static void BatchNorm( ...@@ -24,7 +24,7 @@ static void BatchNorm(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Scale", "ScaleImage", BufferToImage<D, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", BufferToImage<D, float>(net, "Offset", "OffsetImage",
......
...@@ -23,7 +23,7 @@ void Simple() { ...@@ -23,7 +23,7 @@ void Simple() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Scale", "ScaleImage", BufferToImage<D, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", BufferToImage<D, float>(net, "Offset", "OffsetImage",
...@@ -47,7 +47,7 @@ void Simple() { ...@@ -47,7 +47,7 @@ void Simple() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input") .Input("Input")
...@@ -204,7 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -204,7 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
...@@ -234,7 +234,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -234,7 +234,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
...@@ -276,7 +276,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -276,7 +276,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
...@@ -307,7 +307,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -307,7 +307,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
} }
...@@ -349,7 +349,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -349,7 +349,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
...@@ -379,7 +379,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -379,7 +379,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
...@@ -421,7 +421,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -421,7 +421,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
...@@ -452,7 +452,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -452,7 +452,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
} }
} }
...@@ -15,7 +15,7 @@ static void BMBatchToSpace( ...@@ -15,7 +15,7 @@ static void BMBatchToSpace(
OpsTestNet net; OpsTestNet net;
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
......
...@@ -20,7 +20,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -20,7 +20,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM") OpDefBuilder("BiasAdd", "BiasAddBM")
......
...@@ -20,7 +20,7 @@ void BiasAddSimple() { ...@@ -20,7 +20,7 @@ void BiasAddSimple() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Bias", "BiasImage", BufferToImage<D, float>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -34,7 +34,7 @@ void BiasAddSimple() { ...@@ -34,7 +34,7 @@ void BiasAddSimple() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input") .Input("Input")
...@@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Bias", "BiasImage", BufferToImage<DeviceType::OPENCL, float>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -105,7 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -105,7 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
...@@ -140,7 +140,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -140,7 +140,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Bias", "BiasImage", BufferToImage<DeviceType::OPENCL, float>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
...@@ -155,7 +155,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -155,7 +155,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
} }
...@@ -55,23 +55,23 @@ TEST(BufferToImageTest, ArgLarge) { ...@@ -55,23 +55,23 @@ TEST(BufferToImageTest, ArgLarge) {
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 3}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 2, 3, 3}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedia) { TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 13, 17, 128}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 64, 64, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
...@@ -124,7 +124,7 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> ...@@ -124,7 +124,7 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
net.RunOp(D); net.RunOp(D);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3); ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
} }
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
......
...@@ -61,9 +61,9 @@ static void OpenclConcatHelper(int iters, ...@@ -61,9 +61,9 @@ static void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1); net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1);
BufferToImage<DeviceType::OPENCL, T>(net, "Input0", "InputImage0", BufferToImage<DeviceType::OPENCL, T>(net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(net, "Input1", "InputImage1", BufferToImage<DeviceType::OPENCL, T>(net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM") OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0") .Input("InputImage0")
.Input("InputImage1") .Input("InputImage1")
......
...@@ -153,7 +153,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -153,7 +153,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
net.AddRandomInput<DeviceType::OPENCL, float>(input_name, shapes[i]); net.AddRandomInput<DeviceType::OPENCL, float>(input_name, shapes[i]);
BufferToImage<DeviceType::OPENCL, T>(net, input_name, image_name, BufferToImage<DeviceType::OPENCL, T>(net, input_name, image_name,
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
...@@ -170,7 +170,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -170,7 +170,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "Output", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto output = net.GetOutput("Output"); auto output = net.GetOutput("Output");
......
...@@ -34,7 +34,7 @@ static void Conv2d(int iters, ...@@ -34,7 +34,7 @@ static void Conv2d(int iters,
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -97,17 +97,21 @@ static void Conv2d(int iters, ...@@ -97,17 +97,21 @@ static void Conv2d(int iters,
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 //// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 //// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 //// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half); BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half); BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
BM_CONV_2D(1, 128, 16, 16, 3, 3, 1, VALID, 32, half);
BM_CONV_2D(1, 128, 64, 64, 3, 3, 1, VALID, 32, half);
BM_CONV_2D(1, 128, 128, 128, 3, 3, 1, VALID, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
// BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); // BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
// BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); // BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
......
...@@ -100,7 +100,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -100,7 +100,7 @@ void TestNHWCSimple3x3VALID() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -120,7 +120,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -120,7 +120,7 @@ void TestNHWCSimple3x3VALID() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -177,7 +177,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -177,7 +177,7 @@ void TestNHWCSimple3x3SAME() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -262,7 +262,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -262,7 +262,7 @@ void TestNHWCSimple3x3WithoutBias() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
...@@ -279,7 +279,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -279,7 +279,7 @@ void TestNHWCSimple3x3WithoutBias() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
...@@ -369,7 +369,7 @@ static void TestNHWCCombined3x3() { ...@@ -369,7 +369,7 @@ static void TestNHWCCombined3x3() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -389,7 +389,7 @@ static void TestNHWCCombined3x3() { ...@@ -389,7 +389,7 @@ static void TestNHWCCombined3x3() {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
...@@ -442,7 +442,7 @@ void TestConv1x1() { ...@@ -442,7 +442,7 @@ void TestConv1x1() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Filter", "FilterImage", BufferToImage<D, float>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", BufferToImage<D, float>(net, "Bias", "BiasImage",
...@@ -461,7 +461,7 @@ void TestConv1x1() { ...@@ -461,7 +461,7 @@ void TestConv1x1() {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
...@@ -533,7 +533,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -533,7 +533,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -553,7 +553,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -553,7 +553,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -626,7 +626,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -626,7 +626,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// run on gpu // run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", BufferToImage<D, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Filter", "FilterImage", BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", BufferToImage<D, half>(net, "Bias", "BiasImage",
...@@ -646,7 +646,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -646,7 +646,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}; };
...@@ -758,7 +758,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -758,7 +758,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
...@@ -775,7 +775,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -775,7 +775,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
......
...@@ -26,7 +26,7 @@ void SimpleValidTest() { ...@@ -26,7 +26,7 @@ void SimpleValidTest() {
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}); net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -46,7 +46,7 @@ void SimpleValidTest() { ...@@ -46,7 +46,7 @@ void SimpleValidTest() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
...@@ -129,7 +129,7 @@ void ComplexValidTest() { ...@@ -129,7 +129,7 @@ void ComplexValidTest() {
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -149,7 +149,7 @@ void ComplexValidTest() { ...@@ -149,7 +149,7 @@ void ComplexValidTest() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
...@@ -239,7 +239,7 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -239,7 +239,7 @@ void TestNxNS12(const index_t height, const index_t width) {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -259,7 +259,7 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -259,7 +259,7 @@ void TestNxNS12(const index_t height, const index_t width) {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput", ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input") .Input("Input")
......
...@@ -34,7 +34,7 @@ static void DepthwiseConv2d(int iters, ...@@ -34,7 +34,7 @@ static void DepthwiseConv2d(int iters,
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
......
...@@ -7,10 +7,11 @@ ...@@ -7,10 +7,11 @@
namespace mace { namespace mace {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry,
.Device(DeviceType::CPU) OpKeyBuilder("FoldedBatchNorm")
.TypeConstraint<float>("T") .Device(DeviceType::CPU)
.Build(), .TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>); FoldedBatchNormOp<DeviceType::CPU, float>);
#if MACE_ENABLE_NEON #if MACE_ENABLE_NEON
...@@ -21,16 +22,18 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { ...@@ -21,16 +22,18 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
FoldedBatchNormOp<DeviceType::NEON, float>); FoldedBatchNormOp<DeviceType::NEON, float>);
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry,
.Device(DeviceType::OPENCL) OpKeyBuilder("FoldedBatchNorm")
.TypeConstraint<float>("T") .Device(DeviceType::OPENCL)
.Build(), .TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>); FoldedBatchNormOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry,
.Device(DeviceType::OPENCL) OpKeyBuilder("FoldedBatchNorm")
.TypeConstraint<half>("T") .Device(DeviceType::OPENCL)
.Build(), .TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>); FoldedBatchNormOp<DeviceType::OPENCL, half>);
} }
......
...@@ -38,7 +38,7 @@ void Simple() { ...@@ -38,7 +38,7 @@ void Simple() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Scale", "ScaleImage", BufferToImage<D, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", BufferToImage<D, float>(net, "Offset", "OffsetImage",
...@@ -55,7 +55,7 @@ void Simple() { ...@@ -55,7 +55,7 @@ void Simple() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input") .Input("Input")
...@@ -204,7 +204,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -204,7 +204,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
...@@ -222,7 +222,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -222,7 +222,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
...@@ -259,7 +259,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -259,7 +259,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
...@@ -278,7 +278,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -278,7 +278,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
} }
...@@ -315,7 +315,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -315,7 +315,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
...@@ -332,7 +332,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -332,7 +332,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
} }
...@@ -369,7 +369,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -369,7 +369,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage", BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage", BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage", BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
...@@ -387,7 +387,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -387,7 +387,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
} }
} }
...@@ -24,7 +24,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -24,7 +24,7 @@ void TestNHWCSimple3x3VALID() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -44,7 +44,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -44,7 +44,7 @@ void TestNHWCSimple3x3VALID() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
...@@ -81,7 +81,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -81,7 +81,7 @@ void TestNHWCSimple3x3SAME() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -101,7 +101,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -101,7 +101,7 @@ void TestNHWCSimple3x3SAME() {
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
...@@ -149,7 +149,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -149,7 +149,7 @@ void TestNHWCSimple3x3WithoutBias() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
...@@ -166,7 +166,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -166,7 +166,7 @@ void TestNHWCSimple3x3WithoutBias() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", ImageToBuffer<D, T>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input") .Input("Input")
...@@ -218,7 +218,7 @@ void TestConv1x1() { ...@@ -218,7 +218,7 @@ void TestConv1x1() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(net, "Filter", "FilterImage", BufferToImage<D, float>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", BufferToImage<D, float>(net, "Bias", "BiasImage",
...@@ -237,7 +237,7 @@ void TestConv1x1() { ...@@ -237,7 +237,7 @@ void TestConv1x1() {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input") .Input("Input")
...@@ -309,7 +309,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -309,7 +309,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -329,7 +329,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -329,7 +329,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -395,7 +395,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -395,7 +395,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
// run on gpu // run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", BufferToImage<D, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Filter", "FilterImage", BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", BufferToImage<D, half>(net, "Bias", "BiasImage",
...@@ -415,7 +415,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -415,7 +415,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
}; };
...@@ -473,7 +473,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape, ...@@ -473,7 +473,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage",
...@@ -493,7 +493,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape, ...@@ -493,7 +493,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -550,7 +550,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat ...@@ -550,7 +550,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
...@@ -567,7 +567,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat ...@@ -567,7 +567,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -632,7 +632,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -632,7 +632,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
...@@ -649,7 +649,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -649,7 +649,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
}; };
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/matmul.h"
namespace mace {
// Registers the MatMul operator for every supported device/type combination:
// CPU (float) and OpenCL (float and half).
void Register_MatMul(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
                                     .Device(DeviceType::CPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
                    MatMulOp<DeviceType::CPU, float>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<float>("T")
                                     .Build(),
                    MatMulOp<DeviceType::OPENCL, float>);
  // Half-precision variant for the GPU path (no CPU half kernel is registered).
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<half>("T")
                                     .Build(),
                    MatMulOp<DeviceType::OPENCL, half>);
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_MATMUL_H_
#define MACE_OPS_MATMUL_H_
#include "mace/core/operator.h"
#include "mace/kernels/matmul.h"
namespace mace {
// MatMul operator: batched matrix multiplication C = A * B.
// Inputs are 4-D tensors; based on the shape checks below and the tests,
// the layout is {batch, rows, cols, 1}.
template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> {
 public:
  MatMulOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws) {}

  // Validates input shapes, then delegates the actual multiplication to the
  // device/type-specialized functor. Shape violations abort via MACE_CHECK;
  // otherwise returns true.
  bool Run(StatsFuture *future) override {
    const Tensor *A = this->Input(0);  // left operand, {batch, M, K, 1}
    const Tensor *B = this->Input(1);  // right operand, {batch, K, N, 1}
    Tensor *C = this->Output(0);       // result — presumably resized by the functor; TODO confirm
    MACE_CHECK(A->dim_size() == 4 && 4 == B->dim_size())
        << "The dimension of A and B should be 4";
    MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size";
    // Inner dimensions must agree: A's columns (dim 2) == B's rows (dim 1).
    MACE_CHECK(A->dim(2) == B->dim(1))
        << "the number of A's column " << A->dim(2)
        << " must be equal to B's row " << B->dim(1);
    functor_(A, B, C, future);
    return true;
  }

 private:
  kernels::MatMulFunctor<D, T> functor_;  // device-specific kernel implementation
};
} // namespace mace
#endif // MACE_OPS_MATMUL_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
// Benchmarks MatMul with A of shape {batch, height, channels, 1} and
// B of shape {batch, channels, out_width, 1}. Timing excludes setup and
// a 5-iteration warm-up; Sync() brackets the timed region so queued GPU
// work is included in the measurement.
template <DeviceType D, typename T>
static void MatMulBenchmark(
    int iters, int batch, int height, int channels, int out_width) {
  mace::testing::StopTiming();

  OpsTestNet net;
  // Add input data. Inputs are generated as float even for half benchmarks;
  // the BufferToImage step below presumably converts to T — TODO confirm.
  net.AddRandomInput<D, float>("A", {batch, height, channels, 1});
  net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});

  if (D == DeviceType::OPENCL) {
    // A is packed along its width and B along its height — matching the
    // layouts the MatMul OpenCL kernel reads (see matmul_test.cc usage).
    BufferToImage<D, T>(net, "A", "AImage",
                        kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, T>(net, "B", "BImage",
                        kernels::BufferType::IN_OUT_HEIGHT);

    OpDefBuilder("MatMul", "MatMulBM")
        .Input("AImage")
        .Input("BImage")
        .Output("Output")
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
  } else {
    OpDefBuilder("MatMul", "MatMulBM")
        .Input("A")
        .Input("B")
        .Output("Output")
        .Finalize(net.NewOperatorDef());
  }

  // Warm-up runs so kernel compilation / tuning is not timed.
  for (int i = 0; i < 5; ++i) {
    net.RunOp(D);
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.RunOp(D);
  }
  net.Sync();
}
// Expands to a benchmark function named BM_MATMUL_<N>_<H>_<C>_<W>_<TYPE>_<DEVICE>
// and registers it with the benchmark runner. "Items processed" counts the
// N*H*C*W elements touched per iteration.
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
    mace::testing::ItemsProcessed(tot);                                        \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
  }                                                                            \
  BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)

// Only the OpenCL device is benchmarked for now.
#define BM_MATMUL(N, H, C, W, TYPE) \
  BM_MATMUL_MACRO(N, H, C, W, TYPE, OPENCL);

// Benchmarked shapes — presumably sized after Winograd-transformed
// convolution workloads (this merge introduces Winograd); TODO confirm.
BM_MATMUL(16, 32, 128, 49, half);
BM_MATMUL(16, 32, 128, 961, half);
BM_MATMUL(16, 32, 128, 3969, half);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
class MatMulOpTest : public OpsTestBase {};
// Runs a MatMul with the given A/B operands on device D and verifies the
// "Output" tensor against the expected C values (tolerance 1e-5).
template<DeviceType D>
void Simple(const std::vector<index_t> &A_shape,
            const std::vector<float> &A_value,
            const std::vector<index_t> &B_shape,
            const std::vector<float> &B_value,
            const std::vector<index_t> &C_shape,
            const std::vector<float> &C_value) {
  OpsTestNet net;

  // Feed both operands as plain buffers.
  net.AddInputFromArray<D, float>("A", A_shape, A_value);
  net.AddInputFromArray<D, float>("B", B_shape, B_value);

  const bool use_image = (D == DeviceType::OPENCL);
  if (use_image) {
    // GPU path: stage operands into OpenCL images, run, then copy the
    // result image back into the "Output" buffer for comparison.
    BufferToImage<D, float>(net, "A", "AImage",
                            kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, float>(net, "B", "BImage",
                            kernels::BufferType::IN_OUT_HEIGHT);
    OpDefBuilder("MatMul", "MatMulTest")
        .Input("AImage")
        .Input("BImage")
        .Output("OutputImage")
        .Finalize(net.NewOperatorDef());
    net.RunOp(D);
    ImageToBuffer<D, float>(net, "OutputImage", "Output",
                            kernels::BufferType::IN_OUT_HEIGHT);
  } else {
    // CPU path: operate directly on buffers.
    OpDefBuilder("MatMul", "MatMulTest")
        .Input("A")
        .Input("B")
        .Output("Output")
        .Finalize(net.NewOperatorDef());
    net.RunOp(D);
  }

  // Compare against the hand-computed expectation.
  auto expected = CreateTensor<float>(C_shape, C_value);
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
// CPU reference cases with hand-computed expectations:
// a 2x3 * 3x2 product and a 5x5 * 5x5 product.
TEST_F(MatMulOpTest, SimpleCPU) {
  Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
                          {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
                          {1, 2, 2, 1}, {22, 28, 49, 64});
  Simple<DeviceType::CPU>({1, 5, 5, 1},
                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                           16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
                          {1, 5, 5, 1},
                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                           16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
                          {1, 5, 5, 1},
                          {215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
                           765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
                           1315, 1430, 1545, 1660, 1775});
}
// Batch of two identical 2x3 * 3x2 multiplications on CPU.
TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
  Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
                          {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
                          {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
}
// Same hand-computed cases as SimpleCPU, run through the OpenCL path.
TEST_F(MatMulOpTest, SimpleOPENCL) {
  Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
                             {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
                             {1, 2, 2, 1}, {22, 28, 49, 64});
  Simple<DeviceType::OPENCL>({1, 5, 5, 1},
                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
                             {1, 5, 5, 1},
                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
                             {1, 5, 5, 1},
                             {215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
                              765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
                              1315, 1430, 1545, 1660, 1775});
}
// Batched 2x3 * 3x2 multiplications on the GPU, mirroring SimpleCPUWithBatch.
// Fix: this test previously instantiated Simple<DeviceType::CPU>, so despite
// its name it duplicated SimpleCPUWithBatch and never exercised the OpenCL
// batched path (which is supported — see OPENCLUnAlignedWithBatch below).
TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
  Simple<DeviceType::OPENCL>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
                             {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
                             {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
}
template <typename T>
void Complex(const index_t batch,
const index_t height,
const index_t channels,
const index_t out_width) {
srand(time(NULL));
// Construct graph
OpsTestNet net;
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"A", {batch, height, channels, 1});
net.AddRandomInput<DeviceType::OPENCL, float>(
"B", {batch, channels, out_width, 1});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, T>(net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulTest")
.Input("AImage")
.Input("BImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4);
}
}
// Random float cross-check, no batching, dimensions aligned to GPU-friendly sizes.
TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) {
  Complex<float>(1, 64, 128, 32);
  Complex<float>(1, 64, 32, 128);
}
// Random float cross-check with odd (unaligned) dimensions, no batching.
TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) {
  Complex<float>(1, 31, 113, 61);
  Complex<float>(1, 113, 31, 73);
}
// Random float cross-check with unaligned dimensions and multiple batches.
TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) {
  Complex<float>(2, 3, 3, 3);
  Complex<float>(16, 31, 61, 67);
  Complex<float>(31, 31, 61, 67);
}
// Half-precision cross-check, aligned dimensions, no batching (looser tolerance).
TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) {
  Complex<half>(1, 64, 128, 32);
  Complex<half>(1, 64, 32, 128);
}
// Half-precision cross-check with batching; mixes aligned and unaligned sizes.
TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
  Complex<half>(2, 31, 113, 61);
  Complex<half>(16, 32, 64, 64);
  Complex<half>(31, 31, 61, 67);
}
}  // namespace mace
...@@ -134,7 +134,7 @@ static void SimpleMaxPooling3S2() { ...@@ -134,7 +134,7 @@ static void SimpleMaxPooling3S2() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -146,7 +146,7 @@ static void SimpleMaxPooling3S2() { ...@@ -146,7 +146,7 @@ static void SimpleMaxPooling3S2() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
// Run // Run
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
...@@ -198,7 +198,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -198,7 +198,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -211,7 +211,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -211,7 +211,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
} }
...@@ -283,7 +283,7 @@ static void SimpleAvgPoolingTest() { ...@@ -283,7 +283,7 @@ static void SimpleAvgPoolingTest() {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -296,7 +296,7 @@ static void SimpleAvgPoolingTest() { ...@@ -296,7 +296,7 @@ static void SimpleAvgPoolingTest() {
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
...@@ -333,7 +333,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -333,7 +333,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -346,7 +346,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -346,7 +346,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01); ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01);
} }
......
...@@ -27,7 +27,7 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -27,7 +27,7 @@ static void ResizeBilinearBenchmark(int iters,
{output_height, output_width}); {output_height, output_width});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("InputImage") .Input("InputImage")
.Input("OutSize") .Input("OutSize")
......
...@@ -92,7 +92,7 @@ void TestRandomResizeBilinear() { ...@@ -92,7 +92,7 @@ void TestRandomResizeBilinear() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputImage") .Input("InputImage")
...@@ -104,7 +104,7 @@ void TestRandomResizeBilinear() { ...@@ -104,7 +104,7 @@ void TestRandomResizeBilinear() {
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput", ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
// TODO support NEON // TODO support NEON
} }
......
...@@ -20,7 +20,7 @@ static void SoftmaxBenchmark( ...@@ -20,7 +20,7 @@ static void SoftmaxBenchmark(
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxBM") OpDefBuilder("Softmax", "SoftmaxBM")
.Input("InputImage") .Input("InputImage")
......
...@@ -18,7 +18,7 @@ void Simple() { ...@@ -18,7 +18,7 @@ void Simple() {
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage") .Input("InputImage")
...@@ -30,7 +30,7 @@ void Simple() { ...@@ -30,7 +30,7 @@ void Simple() {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input") .Input("Input")
...@@ -72,7 +72,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -72,7 +72,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage") .Input("InputImage")
...@@ -84,7 +84,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -84,7 +84,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
// Transfer output // Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
......
...@@ -16,7 +16,7 @@ static void BMSpaceToBatch( ...@@ -16,7 +16,7 @@ static void BMSpaceToBatch(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
......
...@@ -18,7 +18,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape, ...@@ -18,7 +18,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -30,7 +30,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape, ...@@ -30,7 +30,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8);
} }
...@@ -46,7 +46,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape, ...@@ -46,7 +46,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -58,7 +58,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape, ...@@ -58,7 +58,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8);
} }
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
// Test fixture for the OpenCL Winograd convolution pipeline.
// NOTE(review): "Convlution" is a typo for "Convolution"; renaming would
// touch every TEST_F below, so it is intentionally left unchanged here.
class WinogradConvlutionTest : public OpsTestBase {};
void TransposeFilter(const std::vector<float> &input,
const std::vector<index_t> &input_shape,
std::vector<float> &output) {
output.resize(input.size());
const float *input_ptr = input.data();
for (index_t h = 0; h < input_shape[0]; ++h) {
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t ic = 0; ic < input_shape[2]; ++ic) {
for (index_t oc = 0; oc < input_shape[3]; ++oc) {
int offset = ((oc * input_shape[2] + ic) * input_shape[0] + h) * input_shape[1] + w;
output[offset] = *input_ptr;
++input_ptr;
}
}
}
}
}
// End-to-end check of the Winograd convolution pipeline on device D:
// a reference Conv2D (3x3 filter, stride 1, dilation 1) produces the
// expected output, then the equivalent
// WinogradTransform -> MatMul -> WinogradInverseTransform chain is run and
// compared against it within a tolerance chosen for T.
template<DeviceType D, typename T>
void WinogradConvolution(const index_t batch,
                         const index_t height,
                         const index_t width,
                         const index_t in_channels,
                         const index_t out_channels,
                         const Padding padding) {
  srand(time(NULL));

  // Construct graph
  OpsTestNet net;

  // Add input data
  std::vector<float> filter_data;
  std::vector<index_t> filter_shape = {3, 3, in_channels, out_channels};
  GenerateRandomRealTypeData<float>(filter_shape, filter_data);
  net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
  net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
  net.AddRandomInput<D, T>("Bias", {out_channels});

  BufferToImage<D, T>(net, "Input", "InputImage",
                      kernels::BufferType::IN_OUT_CHANNEL);
  BufferToImage<D, T>(net, "Filter", "FilterImage",
                      kernels::BufferType::CONV2D_FILTER);
  BufferToImage<D, T>(net, "Bias", "BiasImage",
                      kernels::BufferType::ARGUMENT);

  // Reference convolution.
  OpDefBuilder("Conv2D", "Conv2dTest")
      .Input("InputImage")
      .Input("FilterImage")
      .Input("BiasImage")
      .Output("OutputImage")
      .AddIntsArg("strides", {1, 1})
      .AddIntArg("padding", padding)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());

  net.RunOp(D);

  // Transfer output
  ImageToBuffer<D, T>(net, "OutputImage", "ConvOutput",
                      kernels::BufferType::IN_OUT_CHANNEL);
  Tensor expected;
  expected.Copy(*net.GetOutput("ConvOutput"));
  auto output_shape = expected.shape();

  // Winograd convolution
  // transform filter: HWIO -> OIHW, as required by WINOGRAD_FILTER.
  std::vector<float> wino_filter_data;
  TransposeFilter(filter_data, filter_shape, wino_filter_data);
  net.AddInputFromArray<D, float>(
      "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
  BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter",
                      kernels::BufferType::WINOGRAD_FILTER);

  // transform input
  OpDefBuilder("WinogradTransform", "WinogradTransformTest")
      .Input("InputImage")
      .Output("WinoInput")
      .AddIntArg("padding", padding)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

  // Run on opencl
  net.RunOp(D);

  // MatMul
  OpDefBuilder("MatMul", "MatMulTest")
      .Input("WinoFilter")
      .Input("WinoInput")
      .Output("WinoGemm")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
  // Run on opencl
  net.RunOp(D);

  // Inverse transform
  OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
      .Input("WinoGemm")
      .Input("BiasImage")
      .AddIntArg("batch", batch)
      .AddIntArg("height", output_shape[1])
      .AddIntArg("width", output_shape[2])
      // Fix: propagate T like every other op in this chain; previously this
      // builder omitted the "T" arg, so a half-precision run would register
      // the inverse transform with the default type instead of half.
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Output("WinoOutputImage")
      .Finalize(net.NewOperatorDef());

  // Run on opencl
  net.RunOp(D);
  net.Sync();

  ImageToBuffer<D, float>(net, "WinoOutputImage", "WinoOutput",
                          kernels::BufferType::IN_OUT_CHANNEL);
  // Half arithmetic accumulates far more error, so use a looser tolerance.
  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-1);
  } else {
    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-4);
  }
}
// Shapes whose spatial dims and channel counts are multiples of common tile
// and vector widths.
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
                                                 Padding::VALID);
  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
                                                 Padding::SAME);
}
// Odd/prime-ish sizes exercise the boundary-handling (partial tiles,
// non-multiple channel counts).
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
                                                 Padding::VALID);
  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
                                                 Padding::SAME);
}
// batch > 1 exercises the batch dimension of the transform/inverse-transform.
TEST_F(WinogradConvlutionTest, BatchConvolution) {
  WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
                                                 Padding::VALID);
  WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
                                                 Padding::SAME);
}
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/winograd_inverse_transform.h"
namespace mace {
// Registers the WinogradInverseTransform operator. Only OpenCL variants
// (float and half) are registered here.
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<float>("T")
                                     .Build(),
                    WinogradInverseTransformOp<DeviceType::OPENCL, float>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<half>("T")
                                     .Build(),
                    WinogradInverseTransformOp<DeviceType::OPENCL, half>);
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_
#define MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/winograd_transform.h"
#include "mace/kernels/activation.h"
namespace mace {
// Operator wrapper for the Winograd inverse (output) transform. Reads the
// output geometry ("batch"/"height"/"width") and the fused-activation
// parameters ("activation"/"max_limit"/"alpha") from the operator definition
// and forwards everything to the kernel functor.
template<DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> {
 public:
  WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws)
      : Operator<D, T>(op_def, ws),
        functor_(OperatorBase::GetSingleArgument<int>("batch", 1),
                 OperatorBase::GetSingleArgument<int>("height", 0),
                 OperatorBase::GetSingleArgument<int>("width", 0),
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}

  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);
    // The bias input is optional; it is present only when the op was built
    // with two inputs.
    const Tensor *bias = nullptr;
    if (this->InputSize() == 2) {
      bias = this->Input(BIAS);
    }
    Tensor *output = this->Output(OUTPUT);
    functor_(input, bias, output, future);
    return true;
  }

 private:
  kernels::WinogradInverseTransformFunctor<D, T> functor_;

 protected:
  OP_INPUT_TAGS(INPUT, BIAS);
  OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/winograd_transform.h"
namespace mace {
// Registers the WinogradTransform operator. Only OpenCL variants
// (float and half) are registered here.
void Register_WinogradTransform(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<float>("T")
                                     .Build(),
                    WinogradTransformOp<DeviceType::OPENCL, float>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
                                     .Device(DeviceType::OPENCL)
                                     .TypeConstraint<half>("T")
                                     .Build(),
                    WinogradTransformOp<DeviceType::OPENCL, half>);
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_WINOGRAD_TRANSFORM_H_
#define MACE_OPS_WINOGRAD_TRANSFORM_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/winograd_transform.h"
namespace mace {
// Operator wrapper for the Winograd input transform. The "padding" argument
// (defaulting to VALID) selects how the input is padded before tiling, and is
// forwarded to the kernel functor.
template<DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> {
 public:
  WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
      : Operator<D, T>(op_def, ws),
        functor_(static_cast<Padding>(OperatorBase::GetSingleArgument<int>(
            "padding", static_cast<int>(VALID)))) {}

  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);
    Tensor *transformed = this->Output(OUTPUT);
    functor_(input, transformed, future);
    return true;
  }

 private:
  kernels::WinogradTransformFunctor<D, T> functor_;

 protected:
  OP_INPUT_TAGS(INPUT);
  OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_WINOGRAD_TRANSFORM_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
// Benchmarks the WinogradTransform op on a random NHWC input of the given
// shape. Timing excludes graph setup and warm-up runs.
template <DeviceType D, typename T>
static void BMWinogradTransform(
    int iters, int batch, int height, int width, int channels) {
  mace::testing::StopTiming();

  OpsTestNet net;
  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});

  BufferToImage<D, T>(net, "Input", "InputImage",
                      kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("WinogradTransform", "WinogradTransformTest")
      .Input("InputImage")
      .Output("OutputImage")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

  // Warm-up: untimed runs absorb one-time costs (e.g. kernel compilation).
  for (int i = 0; i < 5; ++i) {
    net.RunOp(D);
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.RunOp(D);
  }
  // Sync before stopping so all queued device work is included in the timing.
  net.Sync();
}
// Expands to one registered benchmark per (shape, type, device) combination.
// "tot" counts input elements processed across all iterations.
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)             \
  static void                                                             \
      BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
          int iters) {                                                    \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
    mace::testing::ItemsProcessed(tot);                                   \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
    BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                 \
  }                                                                       \
  BENCHMARK(                                                              \
      BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

// Only the OpenCL device is benchmarked (the op has no CPU registration in
// this change).
#define BM_WINOGRAD_TRANSFORM(N, H, W, C, TYPE) \
  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL);

BM_WINOGRAD_TRANSFORM(1, 16, 16, 128, half);
BM_WINOGRAD_TRANSFORM(1, 64, 64, 128, half);
BM_WINOGRAD_TRANSFORM(1, 128, 128, 128, half);
// Benchmarks the WinogradInverseTransform op. The input is synthesized in the
// transformed domain with shape {16, channels, p, 1}, where
// p = batch * ceil(height/2) * ceil(width/2) is the number of output tiles;
// the leading 16 presumably corresponds to the 4x4 transformed-tile elements
// of F(2x2, 3x3) — TODO confirm against the kernel implementation.
template <DeviceType D, typename T>
static void BMWinogradInverseTransform(
    int iters, int batch, int height, int width, int channels) {
  mace::testing::StopTiming();

  index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2);
  OpsTestNet net;
  net.AddRandomInput<D, float>("Input", {16, channels, p, 1});

  BufferToImage<D, T>(net, "Input", "InputImage",
                      kernels::BufferType::IN_OUT_HEIGHT);
  OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
      .Input("InputImage")
      .AddIntArg("batch", batch)
      .AddIntArg("height", height)
      .AddIntArg("width", width)
      .Output("OutputImage")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

  // Warm-up: untimed runs absorb one-time costs (e.g. kernel compilation).
  for (int i = 0; i < 5; ++i) {
    net.RunOp(D);
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.RunOp(D);
  }
  // Sync before stopping so all queued device work is included in the timing.
  net.Sync();
}
// Expands to one registered benchmark per (shape, type, device) combination.
// N/H/W/C describe the *spatial-domain* output; the benchmark body derives
// the transformed-domain input shape from them.
#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)         \
  static void                                                                 \
      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
          int iters) {                                                        \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
    mace::testing::ItemsProcessed(tot);                                       \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
    BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);              \
  }                                                                           \
  BENCHMARK(                                                                  \
      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

// Only the OpenCL device is benchmarked (the op has no CPU registration in
// this change).
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C, TYPE) \
  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL);

BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32, half);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32, half);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32, half);
}  // namespace mace
\ No newline at end of file
...@@ -10,15 +10,6 @@ licenses(["notice"]) # Apache 2.0 ...@@ -10,15 +10,6 @@ licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
py_proto_library(
name = "mace_py",
srcs = ["mace.proto"],
default_runtime = "@com_google_protobuf//:protobuf_python",
protoc = "@com_google_protobuf//:protoc",
srcs_version = "PY2AND3",
deps = ["@com_google_protobuf//:protobuf_python"],
)
py_proto_library( py_proto_library(
name = "caffe_py", name = "caffe_py",
srcs = ["caffe.proto"], srcs = ["caffe.proto"],
......
...@@ -41,7 +41,7 @@ class Tuner { ...@@ -41,7 +41,7 @@ class Tuner {
template <typename RetType> template <typename RetType>
RetType TuneOrRun( RetType TuneOrRun(
const std::string param_key, const std::string param_key,
std::vector<param_type> &default_param, const std::vector<param_type> &default_param,
const std::function<std::vector<std::vector<param_type>>()> const std::function<std::vector<std::vector<param_type>>()>
&param_generator, &param_generator,
const std::function<RetType(const std::vector<param_type> &, Timer *, std::vector<param_type> *)> &func, const std::function<RetType(const std::vector<param_type> &, Timer *, std::vector<param_type> *)> &func,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册