Commit 33415ee9 authored by 李寅

Return mace status for allocate

Parent ccaec70c
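The commit replaces the MACE_FAILURE_RETURN macro (and the hand-written `if (status != MaceStatus::MACE_SUCCESS) { return status; }` blocks around Allocate calls) with a single MACE_RETURN_IF_ERROR macro, so allocation, resize, and run failures propagate a MaceStatus to the caller. A minimal sketch of such a macro, assuming the usual early-return pattern; the real definition in MACE's headers may differ:

    // Hedged sketch, not MACE's actual definition: evaluate a
    // MaceStatus-returning expression and early-return on failure.
    #define MACE_RETURN_IF_ERROR(stmt)             \
      {                                            \
        MaceStatus status_ = (stmt);               \
        if (status_ != MaceStatus::MACE_SUCCESS) { \
          return status_;                          \
        }                                          \
      }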
@@ -155,13 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
       }
     } else {
 #endif
-    MACE_FAILURE_RETURN(ws_->LoadModelTensor(
+    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
         *net_def, device_type_, model_data));
     // Init model
     auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                          NetMode::INIT);
-    MACE_FAILURE_RETURN(net->Run());
+    MACE_RETURN_IF_ERROR(net->Run());
     net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
 #ifdef MACE_ENABLE_HEXAGON
     }
@@ -195,7 +195,7 @@ MaceStatus MaceEngine::Impl::Run(
                " please use 1 to fill missing dimensions");
     Tensor *input_tensor =
         ws_->GetTensor(MakeString("mace_input_node_", input.first));
-    input_tensor->Resize(input.second.shape());
+    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
     {
       Tensor::MappingGuard input_guard(input_tensor);
       float *input_data = input_tensor->mutable_data<float>();
@@ -221,7 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
     hexagon_controller_->ExecuteGraph(*input_tensors[0], output_tensors[0]);
   } else {
 #endif
-    MACE_FAILURE_RETURN(net_->Run(run_metadata));
+    MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
...
@@ -71,7 +71,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       CallStats call_stats;
       if (future_wait) {
         StatsFuture future;
-        MACE_FAILURE_RETURN(op->Run(&future));
+        MACE_RETURN_IF_ERROR(op->Run(&future));
         if (run_metadata != nullptr) {
           future.wait_fn(&call_stats);
         } else {
@@ -79,10 +79,10 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
         }
       } else if (run_metadata != nullptr) {
         call_stats.start_micros = NowMicros();
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
         call_stats.end_micros = NowMicros();
       } else {
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
       }
       if (run_metadata != nullptr) {
...
@@ -83,10 +83,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
   } else {
     tensor_buffer_ = std::unique_ptr<Buffer>(
         new Buffer(GetDeviceAllocator(type)));
-    MaceStatus status = tensor_buffer_->Allocate(model_data_size);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      return status;
-    }
+    MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
     tensor_buffer_->Map(nullptr);
     tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
                          0, model_data_size);
@@ -156,11 +153,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       if (mem_block.mem_id() >= 20000) {
         std::unique_ptr<BufferBase> image_buf(
             new Image());
-        MaceStatus status = image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype);
-        if (status != MaceStatus::MACE_SUCCESS) {
-          return status;
-        }
+        MACE_RETURN_IF_ERROR(image_buf->Allocate(
+            {mem_block.x(), mem_block.y()}, dtype));
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(image_buf));
       }
@@ -168,12 +162,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       if (mem_block.mem_id() < 20000) {
         std::unique_ptr<BufferBase> tensor_buf(
             new Buffer(GetDeviceAllocator(device_type)));
-        MaceStatus status = tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE);
-        if (status != MaceStatus::MACE_SUCCESS) {
-          return status;
-        }
+        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+            mem_block.x() * GetEnumTypeSize(dtype)
+            + MACE_EXTRA_BUFFER_PAD_SIZE));
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(tensor_buf));
       }
...
@@ -40,7 +40,7 @@ struct AddNFunctor {
                   Tensor *output_tensor,
                   StatsFuture *future) {
     MACE_UNUSED(future);
-    MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensors[0]));
+    MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
     index_t size = output_tensor->size();
     Tensor::MappingGuard output_map(output_tensor);
     float *output_data = output_tensor->mutable_data<float>();
...
@@ -21,73 +21,73 @@ namespace mace {
 namespace kernels {
 
 void Conv2dNeonK1x1S1(const float *input,
                       const float *filter,
                       const index_t batch,
                       const index_t height,
                       const index_t width,
                       const index_t in_channels,
                       const index_t out_channels,
                       float *output);
 
 void Conv2dNeonK3x3S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK3x3S2(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK5x5S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK1x7S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK7x1S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK7x7S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK7x7S2(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK7x7S3(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);
 
 void Conv2dNeonK1x15S1(const float *input,
                        const float *filter,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);
 
 void Conv2dNeonK15x1S1(const float *input,
                        const float *filter,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);
 
 // calculate one output channel and one input channel
 inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
@@ -99,13 +99,13 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
                                const index_t out_width,
                                float *out_ptr,
                                const int stride) {
   for (index_t h = 0; h < out_height; ++h) {
     for (index_t w = 0; w < out_width; ++w) {
       for (int i = 0; i < filter_height; ++i) {
         for (int j = 0; j < filter_width; ++j) {
-          out_ptr[h * out_width + w]
-              += in_ptr[(h * stride + i) * in_width + (w * stride + j)]
-              * filter_ptr[i * filter_width + j];
+          out_ptr[h * out_width + w] +=
+              in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
+              filter_ptr[i * filter_width + j];
         }
       }
     }
...
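Everything below this point is the ARM kernel cleanup: the NEON convolution kernels are reformatted, and each falls back to the scalar Conv2dCPUKHxKWCalc loop above when MACE_ENABLE_NEON is not defined. A hedged usage sketch of that fallback (the toy sizes and the Example wrapper are illustrative, not from the commit):

    // Accumulates one output channel from one input channel; the output
    // buffer must be zero-initialized because the loop only does +=.
    #include <vector>
    #include "mace/kernels/arm/conv_2d_neon.h"

    void Example() {
      const mace::index_t in_width = 4, out_height = 2, out_width = 2;
      std::vector<float> in(4 * 4, 1.0f);      // one 4x4 input channel
      std::vector<float> filter(3 * 3, 0.5f);  // one 3x3 filter
      std::vector<float> out(out_height * out_width, 0.0f);
      mace::kernels::Conv2dCPUKHxKWCalc(in.data(), filter.data(), in_width,
                                        3, 3, out_height, out_width,
                                        out.data(), /*stride=*/1);
      // each of the 2x2 outputs is 9 * 1.0f * 0.5f = 4.5f
    }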
@@ -38,16 +38,15 @@ inline void Conv2dCPUK15x1Calc(const float *in_ptr,
     for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
       for (int i = 0; i < 15; ++i) {
         for (int j = 0; j < 1; ++j) {
-          out_ptr[io * out_image_size + ih * out_width + w + iw]
-              += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)]
-              * filter_ptr[io * in_channels * 15 + i * 1 + j];
+          out_ptr[io * out_image_size + ih * out_width + w + iw] +=
+              in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 1 + j];
         }
       }
     }
   }
 }
 
 // Ho = 4, Wo = 1, Co = 1
 void Conv2dNeonK15x1S1(const float *input,
                        const float *filter,
@@ -59,7 +58,7 @@ void Conv2dNeonK15x1S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
   const index_t tile_width =
       out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
 #pragma omp parallel for collapse(3)
   for (index_t b = 0; b < out_shape[0]; ++b) {
@@ -69,8 +68,7 @@ void Conv2dNeonK15x1S1(const float *input,
       const index_t out_width = out_shape[3];
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
       for (index_t c = 0; c < in_channels; ++c) {
         const float *in_ptr_base =
             input + b * in_batch_size + c * in_image_size;
@@ -147,16 +145,16 @@ void Conv2dNeonK15x1S1(const float *input,
             out_ptr_base[out_offset + 2 * out_width] = vo[2];
             out_ptr_base[out_offset + 3 * out_width] = vo[3];
           }  // wt
         }  // h
 #else
         Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
                            out_height, out_width, w, tile_width,
                            out_image_size, out_ptr_base, 0, 1);
 #endif
       }  // c
     }  // w
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -31,12 +31,8 @@ void Conv2dNeonK1x1S1(const float *input,
                       const index_t out_channels,
                       float *output) {
   for (index_t b = 0; b < batch; ++b) {
-    Gemm(filter,
-         input + b * in_channels * height * width,
-         1,
-         out_channels,
-         in_channels,
-         height * width,
+    Gemm(filter, input + b * in_channels * height * width, 1, out_channels,
+         in_channels, height * width,
          output + b * out_channels * height * width);
   }
 }
...
@@ -17,8 +17,8 @@
 #endif
 
 #include "mace/kernels/arm/conv_2d_neon.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/logging.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -39,16 +39,15 @@ inline void Conv2dCPUK1x15Calc(const float *in_ptr,
     for (index_t iw = 0; iw < out_width; ++iw) {
       for (int i = 0; i < 1; ++i) {
         for (int j = 0; j < 15; ++j) {
-          out_ptr[io * out_image_size + (h + ih) * out_width + iw]
-              += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)]
-              * filter_ptr[io * in_channels * 15 + i * 15 + j];
+          out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
+              in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 15 + j];
         }
       }
     }
   }
 }
 
 // Ho = 1, Wo = 4, Co = 1
 void Conv2dNeonK1x15S1(const float *input,
                        const float *filter,
@@ -70,8 +69,7 @@ void Conv2dNeonK1x15S1(const float *input,
       const index_t out_width = out_shape[3];
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
       for (index_t c = 0; c < in_channels; ++c) {
         const float *in_ptr_base =
             input + b * in_batch_size + c * in_image_size;
@@ -133,16 +131,16 @@ void Conv2dNeonK1x15S1(const float *input,
             vst1q_f32(out_ptr_base + out_offset, vo);
           }  // w
         }  // ht
 #else
         Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
                            out_height, h, tile_height, out_width,
                            out_image_size, out_ptr_base, 0, 1);
 #endif
       }  // c
     }  // h
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -41,8 +41,7 @@ void Conv2dNeonK1x7S1(const float *input,
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK1x7S1(const float *input,
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
 #if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
           /* load filter (4 outch x 1 height x 4 width) */
           float32x4_t vf00, vf01;
           float32x4_t vf10, vf11;
@@ -174,7 +170,7 @@ void Conv2dNeonK1x7S1(const float *input,
               vst1q_f32(out_ptr2_base + out_offset, vo2);
               vst1q_f32(out_ptr3_base + out_offset, vo3);
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
@@ -239,17 +235,16 @@ void Conv2dNeonK1x7S1(const float *input,
               vst1q_f32(out_ptr0_base + out_offset, vo0);
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 1, 7, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
+                             out_height, out_width, out_ptr0_base, 1);
 #endif
         }  // c
       }
     }  // if
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -45,7 +45,7 @@ void Conv2dNeonK3x3S1(const float *input,
       float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
       float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
 #endif
       for (index_t c = 0; c < in_channels; ++c) {
         const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
@@ -54,11 +54,11 @@ void Conv2dNeonK3x3S1(const float *input,
 #if defined(MACE_ENABLE_NEON)
         float *out_ptr1 = out_ptr1_base;
         const float *in_ptr1 =
             input + b * in_batch_size + c * in_image_size + 1 * in_width;
         const float *in_ptr2 =
             input + b * in_batch_size + c * in_image_size + 2 * in_width;
         const float *in_ptr3 =
             input + b * in_batch_size + c * in_image_size + 3 * in_width;
         const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
 #endif
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
@@ -75,7 +75,6 @@ void Conv2dNeonK3x3S1(const float *input,
         vf11 = vld1q_f32(filter_ptr1 + 3);
         vf12 = vld1q_f32(filter_ptr1 + 6);
         for (index_t h = 0; h + 1 < out_height; h += 2) {
-
           for (index_t w = 0; w + 3 < out_width; w += 4) {
             // input (4 height x 3 slide): vi_height_slide
@@ -179,7 +178,7 @@ void Conv2dNeonK3x3S1(const float *input,
           out_ptr0 += out_width;
           out_ptr1 += out_width;
         }  // h
 #elif defined(MACE_ENABLE_NEON)  // arm v7
         float *out_ptr0 = out_ptr0_base;
@@ -198,7 +197,6 @@ void Conv2dNeonK3x3S1(const float *input,
         vf167 = vld1_f32(filter_ptr1 + 6);
         vf189 = vld1_f32(filter_ptr1 + 8);
         for (index_t h = 0; h + 1 < out_height; h += 2) {
-
           for (index_t w = 0; w + 3 < out_width; w += 4) {
             // input (4 height x 3 slide): vi_height_slide
@@ -313,18 +311,18 @@ void Conv2dNeonK3x3S1(const float *input,
       }  // c
     } else {
       for (index_t mm = m; mm < out_channels; ++mm) {
-        float
-            *out_ptr0_base = output + b * out_batch_size + mm * out_image_size;
+        float *out_ptr0_base =
+            output + b * out_batch_size + mm * out_image_size;
         for (index_t c = 0; c < in_channels; ++c) {
-          const float
-              *in_ptr0 = input + b * in_batch_size + c * in_image_size;
+          const float *in_ptr0 =
+              input + b * in_batch_size + c * in_image_size;
 #if defined(MACE_ENABLE_NEON)
           const float *in_ptr1 =
               input + b * in_batch_size + c * in_image_size + 1 * in_width;
           const float *in_ptr2 =
               input + b * in_batch_size + c * in_image_size + 2 * in_width;
           const float *in_ptr3 =
               input + b * in_batch_size + c * in_image_size + 3 * in_width;
 #endif
           const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
@@ -396,7 +394,6 @@ void Conv2dNeonK3x3S1(const float *input,
             vst1q_f32(out_ptr0, vo00);
             vst1q_f32(out_ptr0 + out_width, vo01);
-
             in_ptr0 += 4;
             in_ptr1 += 4;
             in_ptr2 += 4;
@@ -411,7 +408,7 @@ void Conv2dNeonK3x3S1(const float *input,
             in_ptr3 += 2 + in_width;
             out_ptr0 += out_width;
           }  // h
 #elif defined(MACE_ENABLE_NEON)  // arm v7
           float *out_ptr0 = out_ptr0_base;
@@ -482,7 +479,6 @@ void Conv2dNeonK3x3S1(const float *input,
             vst1q_f32(out_ptr0, vo00);
             vst1q_f32(out_ptr0 + out_width, vo01);
-
             in_ptr0 += 4;
             in_ptr1 += 4;
             in_ptr2 += 4;
@@ -499,15 +495,14 @@ void Conv2dNeonK3x3S1(const float *input,
             out_ptr0 += out_width;
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0,
-                             in_width, 3, 3, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
+                             out_width, out_ptr0_base, 1);
 #endif
         }  // c
       }  // mm
     }  // if
   }  // m
 }  // b
 }
 
 void Conv2dNeonK3x3S2(const float *input,
@@ -529,8 +524,7 @@ void Conv2dNeonK3x3S2(const float *input,
       const index_t out_height = out_shape[2];
       const index_t out_width = out_shape[3];
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float
-          *filter_ptr = filter + m * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
@@ -569,8 +563,8 @@ void Conv2dNeonK3x3S2(const float *input,
           index_t out_offset = h * out_width + w;
           vo = vld1q_f32(out_base + out_offset);
           vi00 = vi0.val[0];                // [0.2.4.6]
           vi01 = vi0.val[1];                // [1.3.5.7]
           vi02 = vextq_f32(vi00, vi0n, 1);  // [2.4.6.8]
           vi10 = vi1.val[0];
           vi11 = vi1.val[1];
@@ -591,8 +585,8 @@ void Conv2dNeonK3x3S2(const float *input,
           vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
           vst1q_f32(out_base + out_offset, vo);
         }  // w
       }  // h
 #elif defined(MACE_ENABLE_NEON)  // arm v7
       // load filter (1 outch x 3 height x 3 width): vf_outch_height
       float32x2_t vf01, vf23, vf45, vf67, vf78;
@@ -631,8 +625,8 @@ void Conv2dNeonK3x3S2(const float *input,
           index_t out_offset = h * out_width + w;
           vo = vld1q_f32(out_base + out_offset);
           vi00 = vi0.val[0];                // [0.2.4.6]
           vi01 = vi0.val[1];                // [1.3.5.7]
           vi02 = vextq_f32(vi00, vi0n, 1);  // [2.4.6.8]
           vi10 = vi1.val[0];
           vi11 = vi1.val[1];
@@ -654,15 +648,14 @@ void Conv2dNeonK3x3S2(const float *input,
           vst1q_f32(out_base + out_offset, vo);
         }  // w
       }  // h
 #else
-      Conv2dCPUKHxKWCalc(in_base, filter_ptr,
-                         in_width, 3, 3, out_height, out_width,
-                         out_base, 2);
+      Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
+                         out_width, out_base, 2);
 #endif
     }  // c
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -21,59 +21,59 @@
 namespace mace {
 namespace kernels {
 
 #define MACE_Conv2dNeonK5x5SnLoadCalc4 \
   /* load filter (4 outch x 1 height x 4 width) */ \
   float32x4_t vf00, vf10, vf20, vf30; \
   float32x2_t vf01, vf11, vf21, vf31; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1_f32(filter_ptr0 + 3); \
   vf10 = vld1q_f32(filter_ptr1); \
   vf11 = vld1_f32(filter_ptr1 + 3); \
   vf20 = vld1q_f32(filter_ptr2); \
   vf21 = vld1_f32(filter_ptr2 + 3); \
   vf30 = vld1q_f32(filter_ptr3); \
   vf31 = vld1_f32(filter_ptr3 + 3); \
   \
   /* outch 0 */ \
   vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); \
   \
   /* outch 1 */ \
   vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
   vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
   vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
   vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
   vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 1); \
   \
   /* outch 2 */ \
   vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
   vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
   vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
   vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
   vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 1); \
   \
   /* outch 3 */ \
   vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);
 
 #define MACE_Conv2dNeonK5x5SnLoadCalc1 \
   /* load filter (1 outch x 1 height x 4 width) */ \
   float32x4_t vf00; \
   float32x2_t vf01; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1_f32(filter_ptr0 + 3); \
   \
   /* outch 0 */ \
   vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
 
 // Ho = 1, Wo = 4, Co = 4
@@ -99,7 +99,7 @@ void Conv2dNeonK5x5S1(const float *input,
       float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
       float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
       float *out_ptr2_base =
           output + b * out_batch_size + (m + 2) * out_image_size;
       float *out_ptr3_base =
@@ -118,7 +118,7 @@ void Conv2dNeonK5x5S1(const float *input,
               filter + (m + 3) * in_channels * 25 + c * 25;
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
               // input offset
               index_t in_offset = h * in_width + w;
               // output (4 outch x 1 height x 4 width): vo_outch_height
               float32x4_t vo0, vo1, vo2, vo3;
@@ -157,7 +157,7 @@ void Conv2dNeonK5x5S1(const float *input,
               filter_ptr2 -= 25;
               filter_ptr3 -= 25;
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25,
@@ -203,17 +203,16 @@ void Conv2dNeonK5x5S1(const float *input,
               vst1q_f32(out_ptr0_base + out_offset, vo0);
               filter_ptr0 -= 25;
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 5, 5, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
+                             out_height, out_width, out_ptr0_base, 1);
 #endif
         }  // c
       }  // mm
     }  // if
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -41,8 +41,7 @@ void Conv2dNeonK7x1S1(const float *input,
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK7x1S1(const float *input,
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
 #if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
           /* load filter (4 outch x 4 height x 1 width) */
           float32x4_t vf00, vf01;
           float32x4_t vf10, vf11;
@@ -98,7 +94,6 @@ void Conv2dNeonK7x1S1(const float *input,
                                  out_ptr3_base[out_offset + 2 * out_width],
                                  out_ptr3_base[out_offset + 3 * out_width]};
-
               // input offset
               index_t in_offset = h * in_width + w;
               // input (3 slide)
@@ -203,7 +198,7 @@ void Conv2dNeonK7x1S1(const float *input,
               out_ptr3_base[out_offset + 2 * out_width] = vo3[2];
               out_ptr3_base[out_offset + 3 * out_width] = vo3[3];
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
@@ -280,17 +275,16 @@ void Conv2dNeonK7x1S1(const float *input,
               out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
               out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 7, 1, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
+                             out_height, out_width, out_ptr0_base, 1);
 #endif
         }  // c
       }
     }  // if
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
@@ -21,136 +21,136 @@
 namespace mace {
 namespace kernels {
 
 #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
   /* load filter (4 outch x 1 height x 4 width) */ \
   float32x4_t vf00, vf01; \
   float32x4_t vf10, vf11; \
   float32x4_t vf20, vf21; \
   float32x4_t vf30, vf31; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1q_f32(filter_ptr0 + 3); \
   vf10 = vld1q_f32(filter_ptr1); \
   vf11 = vld1q_f32(filter_ptr1 + 3); \
   vf20 = vld1q_f32(filter_ptr2); \
   vf21 = vld1q_f32(filter_ptr2 + 3); \
   vf30 = vld1q_f32(filter_ptr3); \
   vf31 = vld1q_f32(filter_ptr3 + 3); \
   \
   /* outch 0 */ \
   vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
   vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
   vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
   vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
   vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
   vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
   vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); \
   \
   /* outch 1 */ \
   vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); \
   vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); \
   vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); \
   vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); \
   vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); \
   vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); \
   vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); \
   \
   /* outch 2 */ \
   vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); \
   vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); \
   vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); \
   vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); \
   vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); \
   vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); \
   vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); \
   \
   /* outch 3 */ \
   vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); \
   vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); \
   vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); \
   vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); \
   vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); \
   vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \
   vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
 
 #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \
   /* load filter (1 outch x 1 height x 4 width) */ \
   float32x4_t vf00, vf01; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1q_f32(filter_ptr0 + 3); \
   \
   /* outch 0 */ \
   vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
   vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
   vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
   vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
   vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
   vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
   vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
 
 #define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \
   /* load filter (4 outch x 1 height x 4 width) */ \
   float32x4_t vf00, vf01; \
   float32x4_t vf10, vf11; \
   float32x4_t vf20, vf21; \
   float32x4_t vf30, vf31; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1q_f32(filter_ptr0 + 3); \
   vf10 = vld1q_f32(filter_ptr1); \
   vf11 = vld1q_f32(filter_ptr1 + 3); \
   vf20 = vld1q_f32(filter_ptr2); \
   vf21 = vld1q_f32(filter_ptr2 + 3); \
   vf30 = vld1q_f32(filter_ptr3); \
   vf31 = vld1q_f32(filter_ptr3 + 3); \
   \
   /* outch 0 */ \
   vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); \
   \
   /* outch 1 */ \
   vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
   vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
   vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
   vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
   vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); \
   vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); \
   vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); \
   \
   /* outch 2 */ \
   vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
   vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
   vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
   vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
   vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); \
   vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); \
   vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); \
   \
   /* outch 3 */ \
   vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
 
 #define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \
   /* load filter (1 outch x 1 height x 4 width) */ \
   float32x4_t vf00, vf01; \
   vf00 = vld1q_f32(filter_ptr0); \
   vf01 = vld1q_f32(filter_ptr0 + 3); \
   \
   /* outch 0 */ \
   vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
   vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
   vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
 
 // Ho = 1, Wo = 4, Co = 4
@@ -176,7 +176,7 @@ void Conv2dNeonK7x7S1(const float *input,
       float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
       float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
       float *out_ptr2_base =
           output + b * out_batch_size + (m + 2) * out_image_size;
       float *out_ptr3_base =
@@ -195,7 +195,7 @@ void Conv2dNeonK7x7S1(const float *input,
               filter + (m + 3) * in_channels * 49 + c * 49;
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
               // input offset
               index_t in_offset = h * in_width + w;
               // output (4 outch x 1 height x 4 width): vo_outch_height
               float32x4_t vo0, vo1, vo2, vo3;
@@ -242,7 +242,7 @@ void Conv2dNeonK7x7S1(const float *input,
               filter_ptr2 -= 49;
               filter_ptr3 -= 49;
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
@@ -296,17 +296,16 @@ void Conv2dNeonK7x7S1(const float *input,
               vst1q_f32(out_ptr0_base + out_offset, vo0);
               filter_ptr0 -= 49;
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 7, 7, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                             out_height, out_width, out_ptr0_base, 1);
 #endif
         }  // c
       }  // mm
     }  // if
   }  // m
 }  // b
 }
 
 // Ho = 1, Wo = 4, Co = 4
@@ -372,8 +371,8 @@ void Conv2dNeonK7x7S2(const float *input,
               vvi0 = vld2q_f32(in_ptr_base + in_offset);
               // [8.10.12.14, 9.11.13.15]
               vvi1 = vld2q_f32(in_ptr_base + in_offset + 8);
               vi0 = vvi0.val[0];                     // [0.2.4.6]
               vi1 = vvi0.val[1];                     // [1.3.5.7]
               vi2 = vextq_f32(vi0, vvi1.val[0], 1);  // [2.4.6.8]
               vi3 = vextq_f32(vi1, vvi1.val[1], 1);  // [3.5.7.9]
               vi4 = vextq_f32(vi0, vvi1.val[0], 2);  // [4.6.8.10]
@@ -381,9 +380,9 @@ void Conv2dNeonK7x7S2(const float *input,
               vi6 = vextq_f32(vi0, vvi1.val[0], 3);  // [6.8.10.12]
 #if defined(__aarch64__)
               MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
               MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
               in_offset += in_width;
@@ -403,7 +402,7 @@ void Conv2dNeonK7x7S2(const float *input,
               filter_ptr2 -= 49;
               filter_ptr3 -= 49;
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
@@ -441,8 +440,8 @@ void Conv2dNeonK7x7S2(const float *input,
               vvi0 = vld2q_f32(in_ptr_base + in_offset);
               // [8.10.12.14, 9.11.13.15]
               vvi1 = vld2q_f32(in_ptr_base + in_offset + 8);
               vi0 = vvi0.val[0];                     // [0.2.4.6]
               vi1 = vvi0.val[1];                     // [1.3.5.7]
               vi2 = vextq_f32(vi0, vvi1.val[0], 1);  // [2.4.6.8]
               vi3 = vextq_f32(vi1, vvi1.val[1], 1);  // [3.5.7.9]
               vi4 = vextq_f32(vi0, vvi1.val[0], 2);  // [4.6.8.10]
@@ -462,17 +461,16 @@ void Conv2dNeonK7x7S2(const float *input,
               vst1q_f32(out_ptr0_base + out_offset, vo0);
               filter_ptr0 -= 49;
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 7, 7, out_height, out_width,
-                             out_ptr0_base, 2);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                             out_height, out_width, out_ptr0_base, 2);
 #endif
         }  // c
       }  // mm
     }  // if
   }  // m
 }  // b
 }
 
 // Ho = 1, Wo = 4, Co = 4
@@ -538,18 +536,18 @@ void Conv2dNeonK7x7S3(const float *input,
               vvi0 = vld3q_f32(in_ptr_base + in_offset);
               // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx]
               vvi1 = vld3q_f32(in_ptr_base + in_offset + 12);
               vi0 = vvi0.val[0];                     // [0.3.6.9]
               vi1 = vvi0.val[1];                     // [1.4.7.10]
               vi2 = vvi0.val[2];                     // [2.5.8.11]
               vi3 = vextq_f32(vi0, vvi1.val[0], 1);  // [3.6.9.12]
               vi4 = vextq_f32(vi1, vvi1.val[1], 1);  // [4.7.10.13]
               vi5 = vextq_f32(vi2, vvi1.val[2], 1);  // [5.8.11.14]
               vi6 = vextq_f32(vi0, vvi1.val[0], 2);  // [6.9.12.15]
 #if defined(__aarch64__)
               MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
               MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
               in_offset += in_width;
@@ -569,7 +567,7 @@ void Conv2dNeonK7x7S3(const float *input,
               filter_ptr2 -= 49;
               filter_ptr3 -= 49;
             }  // w
           }  // h
 #else
           for (index_t oc = 0; oc < 4; ++oc) {
             Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
@@ -607,9 +605,9 @@ void Conv2dNeonK7x7S3(const float *input,
               vvi0 = vld3q_f32(in_ptr_base + in_offset);
               // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx]
               vvi1 = vld3q_f32(in_ptr_base + in_offset + 12);
               vi0 = vvi0.val[0];                     // [0.3.6.9]
               vi1 = vvi0.val[1];                     // [1.4.7.10]
               vi2 = vvi0.val[2];                     // [2.5.8.11]
               vi3 = vextq_f32(vi0, vvi1.val[0], 1);  // [3.6.9.12]
               vi4 = vextq_f32(vi1, vvi1.val[1], 1);  // [4.7.10.13]
               vi5 = vextq_f32(vi2, vvi1.val[2], 1);  // [5.8.11.14]
@@ -628,17 +626,16 @@ void Conv2dNeonK7x7S3(const float *input,
               vst1q_f32(out_ptr0_base + out_offset, vo0);
               filter_ptr0 -= 49;
             }  // w
           }  // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 7, 7, out_height, out_width,
-                             out_ptr0_base, 3);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                             out_height, out_width, out_ptr0_base, 3);
 #endif
         }  // c
       }  // mm
     }  // if
   }  // m
 }  // b
 }
 
 }  // namespace kernels
...
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include "mace/kernels/arm/conv_winograd.h" #include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h" #include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -44,14 +44,13 @@ void TransformInput4x4(const float *input, ...@@ -44,14 +44,13 @@ void TransformInput4x4(const float *input,
for (index_t h = 0; h < in_height - 2; h += 2) { for (index_t h = 0; h < in_height - 2; h += 2) {
for (index_t w = 0; w < in_width - 2; w += 2) { for (index_t w = 0; w < in_width - 2; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15; d15;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15; s15;
// load tile data // load tile data
const float *input_ptr = const float *input_ptr = input + n * input_batch_size +
input + n * input_batch_size + c * in_height_width + h * in_width c * in_height_width + h * in_width + w;
+ w;
d0 = input_ptr[0]; d0 = input_ptr[0];
d1 = input_ptr[1]; d1 = input_ptr[1];
d2 = input_ptr[2]; d2 = input_ptr[2];
...@@ -92,7 +91,7 @@ void TransformInput4x4(const float *input, ...@@ -92,7 +91,7 @@ void TransformInput4x4(const float *input,
// store output // store output
float *output_ptr = float *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index; output + n * output_batch_size + c * tile_count + tile_index;
output_ptr[0] = s0; output_ptr[0] = s0;
output_ptr[1 * stride] = s1; output_ptr[1 * stride] = s1;
output_ptr[2 * stride] = s2; output_ptr[2 * stride] = s2;
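
The arithmetic elided between the d loads and the s stores above is the Winograd F(2x2, 3x3) input transform. Assuming the canonical matrices (the diff does not show MACE's actual coefficients), the 4x4 tile d is transformed as

    S = B^\top d B, \qquad
    B^\top = \begin{pmatrix} 1 & 0 & -1 & 0 \\ 0 & 1 & 1 & 0 \\ 0 & -1 & 1 & 0 \\ 0 & 1 & 0 & -1 \end{pmatrix}

so with d read row-major as d0..d15, the first output is s0 = d0 - d2 - d8 + d10, matching the strided store of s0..s15.
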
...@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input, ...@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input,
float s[8][8]; float s[8][8];
for (index_t h = 0; h < in_height - 2; h += 6) { for (index_t h = 0; h < in_height - 2; h += 6) {
for (index_t w = 0; w < in_width - 2; w += 6) { for (index_t w = 0; w < in_width - 2; w += 6) {
const float *input_ptr = const float *input_ptr = input + n * input_batch_size +
input + n * input_batch_size + c * in_height_width + h * in_width c * in_height_width + h * in_width + w;
+ w;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7; float d0, d1, d2, d3, d4, d5, d6, d7;
...@@ -203,7 +201,7 @@ void TransformInput8x8(const float *input, ...@@ -203,7 +201,7 @@ void TransformInput8x8(const float *input,
} }
float *output_ptr = float *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index; output + n * output_batch_size + c * tile_count + tile_index;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7; float d0, d1, d2, d3, d4, d5, d6, d7;
d0 = s[0][i]; d0 = s[0][i];
...@@ -258,27 +256,18 @@ void BatchGemm(const float *input, ...@@ -258,27 +256,18 @@ void BatchGemm(const float *input,
const index_t out_stride = out_channels * tile_count; const index_t out_stride = out_channels * tile_count;
if (batch == 1) { if (batch == 1) {
Gemm(filter, Gemm(filter, input, in_tile_area, out_channels, in_channels, tile_count,
input,
in_tile_area,
out_channels,
in_channels,
tile_count,
output); output);
} else { } else {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int i = 0; i < in_tile_area; ++i) { for (int i = 0; i < in_tile_area; ++i) {
const float const float *in_ptr = input + b * in_batch_size + i * in_stride;
*in_ptr = input + b * in_batch_size + i * in_stride;
const float *filter_ptr = filter + i * filter_stride; const float *filter_ptr = filter + i * filter_stride;
float *out_ptr = output + b * out_batch_size + i * out_stride; float *out_ptr = output + b * out_batch_size + i * out_stride;
Gemm(filter_ptr, Gemm(filter_ptr, in_ptr, 1, out_channels, /* rows */
in_ptr, in_channels, /* K */
1, tile_count, /* cols */
out_channels, /* rows */
in_channels, /* K */
tile_count, /* cols */
out_ptr); out_ptr);
} }
} }
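
Each of the in_tile_area GEMMs above multiplies an out_channels x in_channels filter slice by an in_channels x tile_count input slice, as the /* rows */, /* K */, /* cols */ comments indicate. A scalar reference of the contract these calls assume (GemmRef is a hypothetical stand-in for the tuned Gemm in mace/kernels/gemm.h):

    // C[M x N] = A[M x K] * B[K x N], row-major. The batched loop above
    // invokes the real Gemm once per (batch, tile-position) pair.
    void GemmRef(const float *A, const float *B,
                 int M, int K, int N, float *C) {
      for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
          float sum = 0.0f;
          for (int k = 0; k < K; ++k) {
            sum += A[i * K + k] * B[k * N + j];
          }
          C[i * N + j] = sum;
        }
      }
    }
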
...@@ -305,12 +294,12 @@ void TransformOutput4x4(const float *input, ...@@ -305,12 +294,12 @@ void TransformOutput4x4(const float *input,
for (index_t h = 0; h < out_height; h += 2) { for (index_t h = 0; h < out_height; h += 2) {
for (index_t w = 0; w < out_width; w += 2) { for (index_t w = 0; w < out_width; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15; d15;
float s0, s1, s2, s3, s4, s5, s6, s7; float s0, s1, s2, s3, s4, s5, s6, s7;
float v0, v1, v2, v3; float v0, v1, v2, v3;
const float *input_ptr = const float *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset; input + n * input_batch_size + m * tile_count + tile_offset;
d0 = input_ptr[0]; d0 = input_ptr[0];
d1 = input_ptr[1 * stride]; d1 = input_ptr[1 * stride];
d2 = input_ptr[2 * stride]; d2 = input_ptr[2 * stride];
...@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input, ...@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input,
v2 = s2 - s4 - s6; v2 = s2 - s4 - s6;
v3 = s3 - s5 - s7; v3 = s3 - s5 - s7;
float *output_ptr = float *output_ptr = output + n * output_batch_size +
output + n * output_batch_size + m * out_image_size + h * out_width m * out_image_size + h * out_width + w;
+ w;
output_ptr[0] = v0; output_ptr[0] = v0;
output_ptr[1] = v1; output_ptr[1] = v1;
output_ptr[out_width] = v2; output_ptr[out_width] = v2;
...@@ -403,7 +391,7 @@ void TransformOutput8x8(const float *input, ...@@ -403,7 +391,7 @@ void TransformOutput8x8(const float *input,
for (index_t h = 0; h < out_height; h += 6) { for (index_t h = 0; h < out_height; h += 6) {
for (index_t w = 0; w < out_width; w += 6) { for (index_t w = 0; w < out_width; w += 6) {
const float *input_ptr = const float *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset; input + n * input_batch_size + m * tile_count + tile_offset;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7; float d0, d1, d2, d3, d4, d5, d6, d7;
...@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input, ...@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input,
input_ptr += 8 * stride; input_ptr += 8 * stride;
} }
float *output_ptr = float *output_ptr = output + n * output_batch_size +
output + n * output_batch_size + m * out_image_size + h * out_width m * out_image_size + h * out_width + w;
+ w;
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7; float d0, d1, d2, d3, d4, d5, d6, d7;
...@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input, ...@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input,
} }
} // namespace } // namespace
// OCHW => TOC // OCHW => TOC
// no need to optimize: this transform is done ahead of time in the converter // no need to optimize: this transform is done ahead of time in the converter
void TransformFilter4x4(const float *filter, void TransformFilter4x4(const float *filter,
...@@ -485,7 +471,7 @@ void TransformFilter4x4(const float *filter, ...@@ -485,7 +471,7 @@ void TransformFilter4x4(const float *filter,
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8; float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15; s15;
// load filter // load filter
index_t filter_offset = (m * in_channels + c) * 9; index_t filter_offset = (m * in_channels + c) * 9;
...@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter, ...@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter,
float *output) { float *output) {
const index_t stride = out_channels * in_channels; const index_t stride = out_channels * in_channels;
const float G[8][3] = { const float G[8][3] = {{1.0f, 0.0f, 0.0f},
{1.0f, 0.0f, 0.0f}, {-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, -2.0f / 9, -2.0f / 9}, {-2.0f / 9, 2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9}, {1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, 1.0f / 45, 2.0f / 45}, {1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180}, {0.0f, 0.0f, 1.0f}};
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) { for (index_t m = 0; m < out_channels; ++m) {
...@@ -612,7 +596,7 @@ void TransformFilter8x8(const float *filter, ...@@ -612,7 +596,7 @@ void TransformFilter8x8(const float *filter,
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) { for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] = output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
} }
} }
} }
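
Reading the indices in the store loop, s[k][j] holds g G^\top for the 3x3 kernel g, so each stored element is (G s)[i][j] -- the standard Winograd filter transform for F(6x6, 3x3) with the 8x3 G listed above:

    \tilde{g} = G \, g \, G^\top \in \mathbb{R}^{8 \times 8}, \qquad s = g \, G^\top \in \mathbb{R}^{3 \times 8}
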
...@@ -633,62 +617,38 @@ void WinoGradConv3x3s1(const float *input, ...@@ -633,62 +617,38 @@ void WinoGradConv3x3s1(const float *input,
index_t out_height = in_height - 2; index_t out_height = in_height - 2;
index_t out_width = in_width - 2; index_t out_width = in_width - 2;
index_t tile_height_count = index_t tile_height_count =
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size)); RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
index_t tile_width_count = index_t tile_width_count =
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size)); RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
index_t tile_count = tile_height_count * tile_width_count; index_t tile_count = tile_height_count * tile_width_count;
switch (out_tile_size) { switch (out_tile_size) {
case 2: case 2:
TransformInput4x4(input, TransformInput4x4(input, batch, in_height, in_width, in_channels,
batch, tile_count, transformed_input);
in_height,
in_width,
in_channels,
tile_count,
transformed_input);
break; break;
case 6: case 6:
TransformInput8x8(input, TransformInput8x8(input, batch, in_height, in_width, in_channels,
batch, tile_count, transformed_input);
in_height,
in_width,
in_channels,
tile_count,
transformed_input);
break; break;
default:MACE_NOT_IMPLEMENTED; default:
MACE_NOT_IMPLEMENTED;
} }
BatchGemm(transformed_input, BatchGemm(transformed_input, transformed_filter, batch, in_channels,
transformed_filter, out_channels, tile_count, out_tile_size, transformed_output);
batch,
in_channels,
out_channels,
tile_count,
out_tile_size,
transformed_output);
switch (out_tile_size) { switch (out_tile_size) {
case 2: case 2:
TransformOutput4x4(transformed_output, TransformOutput4x4(transformed_output, batch, out_height, out_width,
batch, out_channels, tile_count, output);
out_height,
out_width,
out_channels,
tile_count,
output);
break; break;
case 6: case 6:
TransformOutput8x8(transformed_output, TransformOutput8x8(transformed_output, batch, out_height, out_width,
batch, out_channels, tile_count, output);
out_height,
out_width,
out_channels,
tile_count,
output);
break; break;
default:MACE_NOT_IMPLEMENTED; default:
MACE_NOT_IMPLEMENTED;
} }
} }
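
The tile bookkeeping above reduces to

    \text{tile\_count} = \left\lceil \tfrac{H-2}{t} \right\rceil \cdot \left\lceil \tfrac{W-2}{t} \right\rceil, \qquad \text{in\_tile\_area} = (t+2)^2

A worked instance for t = out_tile_size = 6 on a hypothetical 224x224 input: the output is 222x222, RoundUpDiv(222, 6) = 37, so tile_count = 37 * 37 = 1369 and each (batch, channel, tile) slot holds 64 transformed values.
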
...@@ -704,52 +664,39 @@ void WinoGradConv3x3s1(const float *input, ...@@ -704,52 +664,39 @@ void WinoGradConv3x3s1(const float *input,
index_t out_height = in_height - 2; index_t out_height = in_height - 2;
index_t out_width = in_width - 2; index_t out_width = in_width - 2;
index_t tile_height_count = index_t tile_height_count =
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size)); RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
index_t tile_width_count = index_t tile_width_count =
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size)); RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
index_t tile_count = tile_height_count * tile_width_count; index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2);
index_t transformed_input_size = index_t transformed_input_size =
in_tile_area * batch * in_channels * tile_count; in_tile_area * batch * in_channels * tile_count;
index_t transformed_filter_size = in_tile_area * out_channels * in_channels; index_t transformed_filter_size = in_tile_area * out_channels * in_channels;
index_t index_t transformed_output_size =
transformed_output_size = in_tile_area * batch * out_channels * tile_count; in_tile_area * batch * out_channels * tile_count;
float *transformed_input = new float[transformed_input_size]; // TNCB float *transformed_input = new float[transformed_input_size]; // TNCB
float *transformed_filter = new float[transformed_filter_size]; // TOC float *transformed_filter = new float[transformed_filter_size]; // TOC
float *transformed_output = new float[transformed_output_size]; float *transformed_output = new float[transformed_output_size];
switch (out_tile_size) { switch (out_tile_size) {
case 2: case 2:
TransformFilter4x4(filter, TransformFilter4x4(filter, in_channels, out_channels, transformed_filter);
in_channels,
out_channels,
transformed_filter);
break; break;
case 6: case 6:
TransformFilter8x8(filter, TransformFilter8x8(filter, in_channels, out_channels, transformed_filter);
in_channels,
out_channels,
transformed_filter);
break; break;
default:MACE_NOT_IMPLEMENTED; default:
MACE_NOT_IMPLEMENTED;
} }
WinoGradConv3x3s1(input, WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width,
transformed_filter, in_channels, out_channels, out_tile_size, transformed_input,
batch, transformed_output, output);
in_height,
in_width, delete[] transformed_input;
in_channels, delete[] transformed_filter;
out_channels, delete[] transformed_output;
out_tile_size,
transformed_input,
transformed_output,
output);
delete[]transformed_input;
delete[]transformed_filter;
delete[]transformed_output;
} }
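
The three raw new[] / delete[] pairs above leak if anything between them returns early or throws. A minimal exception-safe variant under the same sizes (a sketch, not the committed code):

    std::vector<float> transformed_input(transformed_input_size);    // TNCB
    std::vector<float> transformed_filter(transformed_filter_size);  // TOC
    std::vector<float> transformed_output(transformed_output_size);
    // ... transform the filter as above, then:
    WinoGradConv3x3s1(input, transformed_filter.data(), batch, in_height,
                      in_width, in_channels, out_channels, out_tile_size,
                      transformed_input.data(), transformed_output.data(),
                      output);
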
void ConvRef3x3s1(const float *input, void ConvRef3x3s1(const float *input,
...@@ -769,7 +716,7 @@ void ConvRef3x3s1(const float *input, ...@@ -769,7 +716,7 @@ void ConvRef3x3s1(const float *input,
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < out_width; ++w) {
index_t out_offset = index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w; ((b * out_channels + m) * out_height + h) * out_width + w;
output[out_offset] = 0; output[out_offset] = 0;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < 3; ++kh) { for (index_t kh = 0; kh < 3; ++kh) {
...@@ -777,11 +724,10 @@ void ConvRef3x3s1(const float *input, ...@@ -777,11 +724,10 @@ void ConvRef3x3s1(const float *input,
index_t ih = h + kh; index_t ih = h + kh;
index_t iw = w + kw; index_t iw = w + kw;
index_t in_offset = index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw; ((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t index_t filter_offset =
filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw; (((m * in_channels) + c) * 3 + kh) * 3 + kw;
output[out_offset] += output[out_offset] += input[in_offset] * filter[filter_offset];
input[in_offset] * filter[filter_offset];
} }
} }
} }
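
All offsets in the reference kernel follow row-major NCHW flattening; factored out for clarity (OffsetNCHW is a hypothetical helper, using MACE's index_t):

    // Flat offset of element (b, c, h, w) in an NCHW tensor of shape (N, C, H, W).
    inline index_t OffsetNCHW(index_t b, index_t c, index_t h, index_t w,
                              index_t C, index_t H, index_t W) {
      return ((b * C + c) * H + h) * W + w;
    }
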
......
...@@ -13,13 +13,13 @@ ...@@ -13,13 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <random>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <random>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_winograd.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) { ...@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1); std::normal_distribution<float> nd(0, 1);
std::generate(input_data, input_data + input_size, std::generate(input_data, input_data + input_size, [&gen, &nd] {
[&gen, &nd] { return std::max(-1.0f, std::min(1.0f, nd(gen)));
return std::max(-1.0f, std::min(1.0f, nd(gen))); });
}); std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
std::generate(filter_data, filter_data + filter_size, return std::max(-1.0f, std::min(1.0f, nd(gen)));
[&gen, &nd] { });
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
kernels::ConvRef3x3s1(input_data, kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
filter_data, in_channels, out_channels, output_data_ref);
batch,
in_height,
in_width,
in_channels,
out_channels,
output_data_ref);
kernels::WinoGradConv3x3s1(input_data, kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
filter_data, in_width, in_channels, out_channels, 6,
batch,
in_height,
in_width,
in_channels,
out_channels,
6,
output_data); output_data);
// test // test
......
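
The test above fills input and filter with a clipped standard normal via std::generate and a lambda. A standalone equivalent of that generator (assuming nothing beyond the clamp to [-1, 1] visible in the lambda):

    #include <algorithm>
    #include <random>
    #include <vector>

    std::vector<float> MakeClippedNormal(size_t n) {
      std::random_device rd;
      std::mt19937 gen(rd());
      std::normal_distribution<float> nd(0.0f, 1.0f);
      std::vector<float> v(n);
      std::generate(v.begin(), v.end(), [&gen, &nd] {
        return std::max(-1.0f, std::min(1.0f, nd(gen)));  // clip the tails
      });
      return v;
    }
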
...@@ -32,15 +32,15 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -32,15 +32,15 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
float *output); float *output);
void DepthwiseConv2dNeonK3x3S2(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter, const float *filter,
const index_t *in_shape, const index_t *in_shape,
const index_t *out_shape, const index_t *out_shape,
const int *pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
const index_t valid_w_stop, const index_t valid_w_stop,
float *output); float *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base, ...@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1 // Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter, const float *filter,
const index_t* in_shape, const index_t *in_shape,
const index_t* out_shape, const index_t *out_shape,
const int* pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
...@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
...@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w, DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
h - pad_top, w - pad_left, out_width, in_height, in_width, 3,
w - pad_left, 3, out_base);
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
...@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// right // right
for (; w < out_width; ++w) { for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w, DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
h - pad_top, w - pad_left, out_width, in_height, in_width, 3,
w - pad_left, 3, out_base);
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} // h } // h
#else #else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_shape[3]; ++iw) { for (index_t iw = 0; iw < out_shape[3]; ++iw) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih - pad_top,
filter_ptr, iw - pad_left, out_width, in_height, in_width, 3,
ih, 3, out_base);
iw,
ih - pad_top,
iw - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
#endif #endif
...@@ -275,29 +221,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -275,29 +221,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// bottom // bottom
for (; h < out_shape[2]; ++h) { for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
} // m } // m
} // b } // b
} }
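
The top/left/right/bottom loops above fall back to DepthwiseConv2dPixel for output pixels whose 3x3 window crosses the padded border. A scalar sketch of the semantics the call sites assume (signature condensed; the real helper also takes the filter height and width, passed as 3, 3):

    // One output pixel of a 3x3 depthwise conv; taps that land outside the
    // input (negative or past the edge) are treated as zero padding.
    void DepthwiseConv2dPixelRef(const float *in, const float *filter,
                                 index_t h, index_t w,
                                 index_t ih0, index_t iw0,
                                 index_t out_width, index_t in_height,
                                 index_t in_width, float *out) {
      float sum = 0.0f;
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          index_t ih = ih0 + kh;
          index_t iw = iw0 + kw;
          if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
            sum += in[ih * in_width + iw] * filter[kh * 3 + kw];
          }
        }
      }
      out[h * out_width + w] = sum;
    }
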
void DepthwiseConv2dNeonK3x3S2(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter, const float *filter,
const index_t* in_shape, const index_t *in_shape,
const index_t* out_shape, const index_t *out_shape,
const int* pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
...@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
...@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
for (h = valid_h_start; h < valid_h_stop; ++h) { for (h = valid_h_start; h < valid_h_stop; ++h) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
...@@ -397,8 +316,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -397,8 +316,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
index_t out_offset = h * out_width + w; index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7] vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0]; vi10 = vi1.val[0];
vi11 = vi1.val[1]; vi11 = vi1.val[1];
...@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// right // right
for (; w < out_width; ++w) { for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} // h } // h
#else #else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) { for (index_t iw = 0; iw < out_width; ++iw) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih * 2 - pad_top,
filter_ptr, iw * 2 - pad_left, out_width, in_height,
ih, in_width, 3, 3, out_base);
iw,
ih * 2 - pad_top,
iw * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
#endif #endif
...@@ -471,22 +372,13 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -471,22 +372,13 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// bottom // bottom
for (; h < out_shape[2]; ++h) { for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
} // m } // m
} // b } // b
} }
} // namespace kernels } // namespace kernels
......
...@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor { ...@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor {
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
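
From here on the diff is the mechanical rename MACE_FAILURE_RETURN -> MACE_RETURN_IF_ERROR, plus wrapping the Allocate(1) calls whose status was previously dropped. A plausible expansion of the macro (the real definition lives elsewhere in the tree and may differ):

    #define MACE_RETURN_IF_ERROR(stmt)   \
      do {                               \
        MaceStatus status_ = (stmt);     \
        if (status_ != MACE_SUCCESS) {   \
          return status_;                \
        }                                \
      } while (0)
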
...@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
outer_sizes[i] = input->size() / inner_size; outer_sizes[i] = input->size() / inner_size;
output_shape[axis_] += input->dim(axis_); output_shape[axis_] += input->dim(axis_);
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
......
...@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
RoundType::FLOOR, RoundType::FLOOR,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t channels = output->dim(1);
...@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (is_filter_transformed_) { if (is_filter_transformed_) {
transformed_filter_ptr = filter_data; transformed_filter_ptr = filter_data;
} else { } else {
MACE_FAILURE_RETURN(transformed_filter_.Resize( MACE_RETURN_IF_ERROR(transformed_filter_.Resize(
transformed_filter_shape)); transformed_filter_shape));
switch (winograd_out_tile_size) { switch (winograd_out_tile_size) {
case 2: case 2:
...@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const Tensor *pad_input_ptr = input; const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height if (extra_input_height != input_height
|| extra_input_width != input_width) { || extra_input_width != input_width) {
MACE_FAILURE_RETURN(ConstructNCHWInputWithSpecificPadding(input, MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input,
pad_top, pad_top,
pad_bottom, pad_bottom,
pad_left, pad_left,
......
...@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, ...@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2; const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2; const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
...@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, ...@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const int pad_width = pad_left + pad_right; const int pad_width = pad_left + pad_right;
std::vector<index_t> output_shape( std::vector<index_t> output_shape(
{batch, channels, height + pad_height, width + pad_width}); {batch, channels, height + pad_height, width + pad_width});
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
output_tensor->Clear(); output_tensor->Clear();
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
...@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, ...@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2; const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2; const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
......
...@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_, padding_type_, strides_, padding_type_,
output_shape.data(), output_shape.data(),
paddings_.data(), true); paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
} else { } else {
output_shape_.clear(); output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0); output_shape_ = std::vector<index_t>(4, 0);
...@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_, strides_,
output_shape_.data(), output_shape_.data(),
paddings_.data(), true); paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape_)); MACE_RETURN_IF_ERROR(output->Resize(output_shape_));
} }
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
......
...@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor { ...@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor {
std::vector<index_t> output_shape = {batch_size, output_depth, std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width}; output_height, output_width};
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
...@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
RoundType::FLOOR, RoundType::FLOOR,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
output->Clear(); output->Clear();
index_t batch = output->dim(0); index_t batch = output->dim(0);
......
...@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase { ...@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
} }
} }
} }
MACE_FAILURE_RETURN(output->ResizeLike(input0)); MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard input0_guard(input0); Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
...@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1}; std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t N = output->dim(0); const index_t N = output->dim(0);
const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
const index_t output_size = weight->dim(0); const index_t output_size = weight->dim(0);
......
...@@ -44,7 +44,7 @@ struct MatMulFunctor { ...@@ -44,7 +44,7 @@ struct MatMulFunctor {
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
MACE_FAILURE_RETURN(C->Resize(c_shape)); MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A); Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B); Tensor::MappingGuard guardb(B);
......
...@@ -21,12 +21,12 @@ ...@@ -21,12 +21,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
T>::operator()(const Tensor *input, const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
const index_t width = input->dim(2); const index_t width = input->dim(2);
...@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
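
With Allocate now returning MaceStatus, the out-of-range-check setup that every GPU functor repeats is, condensed:

    kernel_error_ = std::unique_ptr<Buffer>(
        new Buffer(GetDeviceAllocator(DeviceType::GPU)));
    MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));  // one-byte error flag
    kernel_error_->Map(nullptr);                       // make it host-visible
    *(kernel_error_->mutable_data<char>()) = 0;        // clear the flag
    kernel_error_->UnMap();                            // hand it to the kernel
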
...@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
switch (activation_) { switch (activation_) {
case RELU:tuning_key_prefix_ = "relu_opencl_kernel"; case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX:tuning_key_prefix_ = "relux_opencl_kernel"; case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU:tuning_key_prefix_ = "prelu_opencl_kernel"; case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH:tuning_key_prefix_ = "tanh_opencl_kernel"; case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID:tuning_key_prefix_ = "sigmoid_opencl_kernel"; case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
default:LOG(FATAL) << "Unknown activation type: " << activation_; default:
LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
...@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct ActivationFunctor<DeviceType::GPU, float>;
struct ActivationFunctor<DeviceType::GPU, float>; template struct ActivationFunctor<DeviceType::GPU, half>;
template
struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -71,7 +71,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -71,7 +71,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape(); std::vector<index_t> output_shape = input_tensors[0]->shape();
...@@ -87,13 +87,13 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,13 +87,13 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -23,14 +23,15 @@ namespace mace { ...@@ -23,14 +23,15 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *scale, const Tensor *input,
const Tensor *offset, const Tensor *scale,
const Tensor *mean, const Tensor *offset,
const Tensor *var, const Tensor *mean,
const float epsilon, const Tensor *var,
Tensor *output, const float epsilon,
StatsFuture *future) { Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
...@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -96,7 +97,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -96,7 +97,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -23,9 +23,9 @@ namespace kernels { ...@@ -23,9 +23,9 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
const index_t width = input->dim(2); const index_t width = input->dim(2);
...@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -67,7 +67,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -67,7 +67,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} else { } else {
std::vector<uint32_t> roundup_gws(lws.size()); std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) { for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
roundup_gws[i] = RoundUp(gws[i], lws[i]);
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
......
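
When non-uniform work-groups are unsupported, the launch rounds each global size up to a multiple of the local size, as in the roundup_gws loop above. RoundUp is assumed to behave like:

    // Smallest multiple of `multiple` that is >= `value` (multiple > 0).
    template <typename T>
    T RoundUpRef(T value, T multiple) {
      return (value + multiple - 1) / multiple * multiple;
    }
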
...@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
const BufferType type, const BufferType type,
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape); CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type); std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
MACE_FAILURE_RETURN(image->ResizeImage(new_shape, image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else { } else {
MACE_FAILURE_RETURN(image->ResizeImage(buffer->shape(), image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
} }
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
...@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -107,7 +106,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,7 +106,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(buffer->buffer_offset() / static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype()))); GetEnumTypeSize(buffer->dtype())));
if (type == CONV2D_FILTER) { if (type == CONV2D_FILTER) {
const index_t inner_size = const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
...@@ -16,18 +16,16 @@ ...@@ -16,18 +16,16 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input, Tensor *output, StatsFuture *future) {
Tensor *output, MACE_RETURN_IF_ERROR(output->ResizeLike(input));
StatsFuture *future) {
MACE_FAILURE_RETURN(output->ResizeLike(input));
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
...@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const index_t channels_per_group = channels / groups_; const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0, MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be multiple of 4"); "channels per group must be multiple of 4");
MACE_CHECK(groups_ % 4 == 0, MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
"groups must be multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
...@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, kernel_ =
built_options); runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -76,7 +73,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -76,7 +73,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
struct ChannelShuffleFunctor<DeviceType::GPU, float>; template struct ChannelShuffleFunctor<DeviceType::GPU, half>;
template
struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,11 +22,9 @@ namespace mace { ...@@ -22,11 +22,9 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -37,16 +35,15 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -37,16 +35,15 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
static MaceStatus Concat2(cl::Kernel *kernel,
static void Concat2(cl::Kernel *kernel, const Tensor *input0,
const Tensor *input0, const Tensor *input1,
const Tensor *input1, const DataType dt,
const DataType dt, std::vector<index_t> *prev_input_shape,
std::vector<index_t> *prev_input_shape, Tensor *output,
Tensor *output, StatsFuture *future,
StatsFuture *future, uint32_t *kwg_size,
uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error) {
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -67,8 +64,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -67,8 +64,8 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -95,7 +92,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -95,7 +92,7 @@ static void Concat2(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -125,15 +122,17 @@ static void Concat2(cl::Kernel *kernel, ...@@ -125,15 +122,17 @@ static void Concat2(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
static void ConcatN(cl::Kernel *kernel, static MaceStatus ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -179,7 +178,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -179,7 +178,7 @@ static void ConcatN(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->is_profiling_enabled()) { if (runtime->is_profiling_enabled()) {
CallStats tmp_stats; CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats); runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros =
call_stats.start_micros); std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
} }
} }
...@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel,
} }
}; };
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
...@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()( ...@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."); "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, return Concat2(&kernel_, input_list[0], input_list[1],
&input_shape_, output, future, &kwg_size_, &kernel_error_); DataTypeToEnum<T>::value, &input_shape_, output, future,
break; &kwg_size_, &kernel_error_);
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future, return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
&kwg_size_, &kernel_error_); future, &kwg_size_, &kernel_error_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
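
Concat2 and ConcatN above illustrate the conversion recipe applied to every launcher in this commit: the return type changes from void to MaceStatus, the fallible Allocate call is wrapped in MACE_RETURN_IF_ERROR, a trailing return MACE_SUCCESS; is added at the end of the happy path, and the functor forwards the helper's result with a tail return instead of falling through to its own unconditional success. A self-contained toy version of that behavior (all names below are stand-ins, not the real MACE declarations):

#include <cstdio>

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

#define MACE_RETURN_IF_ERROR(stmt)    \
  {                                   \
    MaceStatus status = (stmt);       \
    if (status != MACE_SUCCESS) {     \
      return status;                  \
    }                                 \
  }

// Stand-in for Buffer::Allocate: pretend oversized requests fail.
MaceStatus Allocate(unsigned nbytes) {
  return nbytes > 1024 ? MACE_OUT_OF_RESOURCES : MACE_SUCCESS;
}

// Post-commit shape of a launcher: failures are reported, not swallowed.
MaceStatus Concat2Like(unsigned nbytes) {
  MACE_RETURN_IF_ERROR(Allocate(nbytes));
  // ... kernel setup and launch would follow here ...
  return MACE_SUCCESS;
}

// Post-commit shape of the functor: tail-return the helper's status.
MaceStatus FunctorLike(unsigned nbytes) {
  return Concat2Like(nbytes);
}

int main() {
  std::printf("small request -> %d\n", FunctorLike(64));        // 0 (success)
  std::printf("huge request  -> %d\n", FunctorLike(1u << 20));  // 1 (propagated)
  return 0;
}

Note that the switch in ConcatFunctor::operator() can now return directly from case 2 and from the divisible-four branch; the old break after the Concat2 call disappears because control never reaches it.
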
...@@ -18,61 +18,61 @@ ...@@ -18,61 +18,61 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
extern void Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
template <typename T> template <typename T>
MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)( typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter, cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding, const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
...@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), return conv2d_func(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), return Conv2dOpencl(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} }
return MACE_SUCCESS;
} }
template struct Conv2dFunctor<DeviceType::GPU, float>; template struct Conv2dFunctor<DeviceType::GPU, float>;
......
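
Changing the launchers' return type ripples through the dispatch machinery in conv_2d.cc: the extern declarations, the Conv2dOpenclFunction typedef, and the three definitions must all switch from void to MaceStatus in the same commit, or the selector table simply fails to compile. A trimmed sketch of that dispatch shape (parameter lists abbreviated to a single int; the real signatures are the fifteen-argument ones shown above):

enum MaceStatus { MACE_SUCCESS = 0 };

// Abbreviated launchers; the real ones take the kernel, tensors, strides,
// paddings, dilations, activation parameters, and the error buffer.
MaceStatus Conv2dOpenclK1x1(int stride) { (void)stride; return MACE_SUCCESS; }
MaceStatus Conv2dOpenclK3x3(int stride) { (void)stride; return MACE_SUCCESS; }
MaceStatus Conv2dOpencl(int stride) { (void)stride; return MACE_SUCCESS; }

// The typedef must agree with the launchers' new return type.
typedef MaceStatus (*Conv2dOpenclFunction)(int stride);

MaceStatus Dispatch(int kernel_h, int stride) {
  static const Conv2dOpenclFunction selector[5] = {
      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
  if (kernel_h >= 1 && kernel_h <= 5 && selector[kernel_h - 1] != nullptr) {
    return selector[kernel_h - 1](stride);  // tail-return the status
  }
  return Conv2dOpencl(stride);  // general fallback path
}

A side effect worth noticing: since both branches of operator() now return the launcher's status directly, the old trailing return MACE_SUCCESS; is deleted rather than left behind as dead code.
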
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
...@@ -25,11 +25,9 @@ namespace { ...@@ -25,11 +25,9 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128; const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]); lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size / compute_units) * 8, (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base); lws[2] = std::min<uint32_t>(gws[2], base);
} }
...@@ -57,21 +54,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -57,21 +54,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_UNUSED(padding); MACE_UNUSED(padding);
MACE_UNUSED(dilations); MACE_UNUSED(dilations);
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -145,7 +142,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -145,7 +142,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
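
The one-line include moves at the top of these files (mace/kernels/conv_2d.h dropping below mace/core/runtime/opencl/opencl_runtime.h) are mechanical: the headers are kept as a single alphabetically sorted block, and "core" sorts before "kernels". This looks like standard cpplint/clang-format include ordering, though the commit itself does not name the tool. For this file the sorted block reads:

#include "mace/core/runtime/opencl/opencl_runtime.h"  // "core" < "kernels"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
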
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -24,22 +24,20 @@ namespace kernels { ...@@ -24,22 +24,20 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>( uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1); OpenCLRuntime::Global()->device_compute_units() / 2, 1);
const uint32_t base = std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, const uint32_t base =
4); std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base), lws[0] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>(
RoundUp<uint32_t>(cache_size / kernel_cache_size / RoundUp<uint32_t>(
lws_size / compute_units, base), cache_size / kernel_cache_size / lws_size / compute_units, base),
gws[2]); gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base); lws[2] = std::min<uint32_t>(gws[2], base);
...@@ -50,21 +48,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -50,21 +48,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -129,7 +127,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -129,7 +127,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kernel_size, const uint32_t kernel_size,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -41,10 +40,10 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -41,10 +40,10 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} }
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]); lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
(cache_size / kernel_cache_size / kernel_size / lws_size / compute_units) lws_size / compute_units) *
* 8, 8,
gws[2]); gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
if (gws[2] < lws_limit) { if (gws[2] < lws_limit) {
lws[2] = gws[2]; lws[2] = gws[2];
...@@ -58,21 +57,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -58,21 +57,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -137,7 +136,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -137,7 +136,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -168,11 +167,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -168,11 +167,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
} }
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel", output->dim(0), Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3), output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws = std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
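
A large share of the remaining hunks in this commit are pure reflow with no semantic change: once a function's signature or an expression's first line changed, the whole statement was rewrapped to the 80-column limit, presumably by clang-format with the project's Google-derived style (the tool is an assumption; the diff only shows its output). A typical pair:

// Before: manually wrapped across two lines.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size);

// After: fits in 80 columns, so it collapses to one line.
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size);

The same treatment produces the reflowed Concat(...) tuning keys, the rewrapped std::min/RoundUp expressions, and the re-indented switch over activation types in deconv_2d below.
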
...@@ -20,20 +20,20 @@ namespace kernels { ...@@ -20,20 +20,20 @@ namespace kernels {
namespace { namespace {
void Deconv2dOpencl(cl::Kernel *kernel, MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *paddings, const int *paddings,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
#define MACE_WIDTH_BLK 5 #define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride - 1) / stride; const index_t n_strides = (width + stride - 1) / stride;
const index_t width_blocks = const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride; ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
const float stride_r = 1.f / static_cast<float>(stride); const float stride_r = 1.f / static_cast<float>(stride);
const int padding_h = (paddings[0]+1) >> 1; const int padding_h = (paddings[0] + 1) >> 1;
const int padding_w = (paddings[0]+1) >> 1; const int padding_w = (paddings[0] + 1) >> 1;
const int align_h = stride - 1 - padding_h; const int align_h = stride - 1 - padding_h;
const int align_w = stride - 1 - padding_w; const int align_w = stride - 1 - padding_w;
...@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel,
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP:break; case NOOP:
case RELU:built_options.emplace("-DUSE_RELU");
break; break;
case RELUX:built_options.emplace("-DUSE_RELUX"); case RELU:
built_options.emplace("-DUSE_RELU");
break; break;
case TANH:built_options.emplace("-DUSE_TANH"); case RELUX:
built_options.emplace("-DUSE_RELUX");
break; break;
case SIGMOID:built_options.emplace("-DUSE_SIGMOID"); case TANH:
built_options.emplace("-DUSE_TANH");
break; break;
default:LOG(FATAL) << "Unknown activation type: " << activation; case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
} }
*kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options); *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
...@@ -150,16 +156,19 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -150,16 +156,19 @@ void Deconv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace } // namespace
template <typename T> template <typename T>
MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *filter, const Tensor *input,
const Tensor *bias, const Tensor *filter,
Tensor *output, const Tensor *bias,
StatsFuture *future) { Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
...@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (output_shape_.size() == 4) { if (output_shape_.size() == 4) {
paddings_.clear(); paddings_.clear();
paddings_ = std::vector<int>(2, 0); paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize( CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
input->shape().data(), strides_, padding_type_, output_shape_.data(),
filter->shape().data(), paddings_.data());
strides_, padding_type_,
output_shape_.data(),
paddings_.data());
} else { } else {
output_shape_.clear(); output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0); output_shape_ = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(), CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
filter->shape().data(), strides_, output_shape_.data(), paddings_.data());
strides_,
output_shape_.data(),
paddings_.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape_, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape_, output_image_shape));
Deconv2dOpencl(&kernel_, input, filter, bias, return Deconv2dOpencl(&kernel_, input, filter, bias, strides_[0],
strides_[0], paddings_.data(), paddings_.data(), activation_, relux_max_limit_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, output, future,
DataTypeToEnum<T>::value, &input_shape_, &kwg_size_, &kernel_error_);
output, future, &kwg_size_, &kernel_error_);
return MACE_SUCCESS;
} }
template struct Deconv2dFunctor<DeviceType::GPU, float>; template struct Deconv2dFunctor<DeviceType::GPU, float>;
......
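
Every GPU kernel touched here repeats the same OUT_OF_RANGE_CHECK setup: allocate a one-byte scratch buffer that the kernel can write an error code into, map it, zero the flag, and unmap it again; after the run, the byte is mapped back and verified with MACE_CHECK. Allocate is the only step in that sequence which returns a MaceStatus, so it is the only call wrapped; Map and UnMap remain void in this change. Annotated, the recurring fragment is:

*kernel_error = std::move(std::unique_ptr<Buffer>(
    new Buffer(GetDeviceAllocator(DeviceType::GPU))));
// The one fallible step: a failed device allocation now aborts the launcher
// and surfaces to the functor's caller instead of being ignored.
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;  // clear the error flag
(*kernel_error)->UnMap();

(The std::move around the freshly constructed unique_ptr is redundant, since the temporary is already an rvalue, but the commit leaves that pre-existing idiom untouched.)
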
...@@ -40,7 +40,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,7 +40,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width * block_size_; output_width = input_width * block_size_;
output_depth = input_depth / (block_size_ * block_size_); output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not support:") MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
<< output_depth; << output_depth;
kernel_name = "depth_to_space"; kernel_name = "depth_to_space";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
...@@ -53,7 +53,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -53,7 +53,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width / block_size_; output_width = input_width / block_size_;
output_depth = input_depth * block_size_ * block_size_; output_depth = input_depth * block_size_ * block_size_;
MACE_CHECK(input_depth % 4 == 0, "input channel not support:") MACE_CHECK(input_depth % 4 == 0, "input channel not support:")
<< input_depth; << input_depth;
kernel_name = "space_to_depth"; kernel_name = "space_to_depth";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
...@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
runtime->BuildKernel("depth_to_space", built_options);
obfuscated_kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -107,7 +106,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,7 +106,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -24,8 +24,7 @@ namespace kernels { ...@@ -24,8 +24,7 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize; uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
...@@ -40,9 +39,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -40,9 +39,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} }
} }
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
(cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = gws[2]; lws[2] = gws[2];
} }
...@@ -52,21 +50,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -52,21 +50,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
static void DepthwiseConv2d(cl::Kernel *kernel, static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC const Tensor *input, // NHWC
const Tensor *filter, // HWIM const Tensor *filter, // HWIM
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *paddings, const int *paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -149,7 +147,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -149,7 +147,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", std::string tuning_key =
gws[0], gws[1], gws[2], multiplier); Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
...@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) { if (strides_[0] != strides_[1]) {
...@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), return DepthwiseConv2d(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
return MACE_SUCCESS;
} }
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
......
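
The depthwise path makes the payoff of the refactor easy to see: previously DepthwiseConv2d returned void and the functor reported success unconditionally, so an allocation failure inside the launcher was invisible to the caller; now a single tail return carries the launcher's verdict. In miniature (stand-in names, not the real signatures):

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

// Stand-in launcher; pretend its internal allocation failed.
MaceStatus DepthwiseConv2dLike() { return MACE_OUT_OF_RESOURCES; }

// Pre-commit shape: the launcher returned void, so there was nothing to
// check, and the functor claimed success no matter what happened inside.
MaceStatus OperatorBefore() {
  DepthwiseConv2dLike();  // result silently discarded
  return MACE_SUCCESS;
}

// Post-commit shape: one tail return; failure can no longer be masked.
MaceStatus OperatorAfter() {
  return DepthwiseConv2dLike();
}
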
...@@ -22,16 +22,15 @@ namespace kernels { ...@@ -22,16 +22,15 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
bool swapped = false; bool swapped = false;
if (input1 != nullptr) { if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size() MACE_CHECK(input0->dim_size() == input1->dim_size() ||
|| input0->dim_size() == 1 input0->dim_size() == 1 || input1->dim_size() == 1)
|| input1->dim_size() == 1) << "Inputs of Eltwise op must be same shape";
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) { if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) { if (input0->size() < input1->size()) {
std::swap(input0, input1); std::swap(input0, input1);
...@@ -39,28 +38,26 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -39,28 +38,26 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
} }
if (input1->dim_size() == 1) { if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(3) == input1->dim(0)) MACE_CHECK(input0->dim(3) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast"; << "Element-Wise op only support channel dimension broadcast";
} else { } else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) && MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
input0->dim(3) == input1->dim(3) && input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
input1->dim(1) == 1 && input1->dim(2) == 1)
input1->dim(2) == 1) << "Element-Wise op only support channel dimension broadcast";
<< "Element-Wise op only support channel dimension broadcast";
} }
} }
} }
std::vector<index_t > output_shape(4); std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0); output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1); output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2); output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3); output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,7 +112,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -115,7 +112,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -20,18 +20,18 @@ namespace kernels { ...@@ -20,18 +20,18 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T>
void FCWXKernel(cl::Kernel *kernel, MaceStatus FCWXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future, StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -115,7 +115,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -115,7 +115,7 @@ void FCWXKernel(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
...@@ -170,21 +170,23 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -170,21 +170,23 @@ void FCWXKernel(cl::Kernel *kernel,
} }
}; };
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
void FCWTXKernel(cl::Kernel *kernel, MaceStatus FCWTXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future, StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t kwg_size = uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
*lws = {16, kwg_size/16, 0}; *lws = {16, kwg_size / 16, 0};
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -246,7 +248,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -246,7 +248,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
...@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel,
} }
std::string tuning_key = std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(1), output->dim(2), output->dim(3)); output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace } // namespace
...@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()( ...@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
return MACE_SUCCESS; return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
} }
template struct FullyConnectedFunctor<DeviceType::GPU, float>; template struct FullyConnectedFunctor<DeviceType::GPU, float>;
......
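The change to FCWXKernel and FCWTXKernel above is the commit's second recurring pattern: a helper that used to return void now reports MaceStatus, gains an explicit return MACE_SUCCESS; before its closing brace, and the calling functor forwards that status with a plain return instead of unconditionally reporting success. A stripped-down sketch of the before/after shape (names and signatures simplified; not the real MACE code):

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

MaceStatus AllocateKernelErrorBuffer() {
  return MACE_SUCCESS;  // imagine Buffer::Allocate(1) here
}

MaceStatus FCKernel() {  // was: void FCKernel()
  MaceStatus status = AllocateKernelErrorBuffer();
  if (status != MACE_SUCCESS) return status;  // what the macro expands to
  // ... build and enqueue the OpenCL kernel ...
  return MACE_SUCCESS;  // the newly added explicit success return
}

MaceStatus FullyConnectedOperator() {
  return FCKernel();  // was: FCKernel(); return MACE_SUCCESS;
}

int main() { return FullyConnectedOperator() == MACE_SUCCESS ? 0 : 1; }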
...@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), lws[2] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::min<uint32_t>(base, kwg_size / lws_size); lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
return lws; return lws;
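Default3DLocalWS, reflowed above, caps each local dimension by the global size, the remaining work-group budget, and a base derived from the GPU's global-memory cache size. A worked example with assumed numbers (kwg_size = 256 and base = 4 are illustrative, not queried from a device):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Same arithmetic as Default3DLocalWS, with the OpenCLRuntime query replaced
// by an explicit 'base' parameter so the result can be checked by hand.
std::vector<uint32_t> LocalWS3D(const uint32_t *gws, uint32_t kwg_size,
                                uint32_t base) {
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  lws[0] = std::min<uint32_t>(base, kwg_size / (lws[1] * lws[2]));
  return lws;
}

int main() {
  const uint32_t gws[3] = {32, 56, 56};  // assumed global work size
  std::vector<uint32_t> lws = LocalWS3D(gws, 256, 4);
  assert(lws[1] == 56);  // min(56, 256)
  assert(lws[2] == 4);   // min(min(56, 4), 256 / 56 == 4)
  assert(lws[0] == 1);   // min(4, 256 / (56 * 4) == 1)
  return 0;
}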
...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 4) MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D"; << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+3); std::vector<uint32_t> internal_gws(gws, gws + 3);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3]; uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<std::vector<uint32_t>> results; std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = { std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0}, {kwg_size / 2, 2, 0}, {kwg_size / 4, 4, 0},
{kwg_size / 4, 4, 0}, {kwg_size / 8, 8, 0}, {kwg_size / 16, 16, 0},
{kwg_size / 8, 8, 0}, {kwg_size / 32, 32, 0}, {kwg_size / 64, 64, 0},
{kwg_size / 16, 16, 0}, {kwg_size / 128, 128, 0}, {kwg_size / 256, 256, 0},
{kwg_size / 32, 32, 0}, {kwg_size, 1, 0}, {1, kwg_size, 0}};
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
for (auto &ele : candidates) { for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2]; const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) { if (0 < tmp && tmp <= kwg_size) {
...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 3) MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d"; << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+2); std::vector<uint32_t> internal_gws(gws, gws + 2);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2]; uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
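Both tuning helpers in this file split the outermost global dimension into blocks before enqueueing. When the runtime supports non-uniform work-groups only the final block shrinks to the remainder; otherwise the global size was already rounded up to the tuned parameter and the kernel is expected to mask the overrun itself. The block arithmetic in isolation (RoundUp and RoundUpDiv reimplemented here to match their names; assumptions, not MACE's utils):

#include <cstdint>
#include <iostream>

template <typename T>
T RoundUpDiv(T a, T b) { return (a + b - 1) / b; }

template <typename T>
T RoundUp(T a, T b) { return RoundUpDiv(a, b) * b; }

int main() {
  const uint32_t gws2 = 100, block_size = 32;                // assumed sizes
  const uint32_t num_blocks = RoundUpDiv(gws2, block_size);  // 4
  const bool non_uniform_supported = true;                   // assumed
  for (uint32_t i = 0; i < num_blocks; ++i) {
    uint32_t this_block = block_size;
    if (non_uniform_supported && i == num_blocks - 1) {
      this_block = gws2 - i * block_size;  // final block: 100 - 96 = 4
    }
    std::cout << "block " << i << ": " << this_block << " items\n";
  }
  // Without the capability, gws2 is RoundUp'd to 128 first and all four
  // blocks stay at 32.
  return 0;
}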
...@@ -88,10 +88,9 @@ inline bool LimitKernelTime() { ...@@ -88,10 +88,9 @@ inline bool LimitKernelTime() {
} }
template <typename T> template <typename T>
bool IsVecEqual(const std::vector<T> &input0, bool IsVecEqual(const std::vector<T> &input0, const std::vector<T> &input1) {
const std::vector<T> &input1) {
return ((input0.size() == input1.size()) && return ((input0.size() == input1.size()) &&
(std::equal(input0.begin(), input0.end(), input1.begin()))); (std::equal(input0.begin(), input0.end(), input1.begin())));
} }
template <typename T> template <typename T>
......
...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
const BufferType type, const BufferType type,
Tensor *buffer, Tensor *buffer,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape); CalImage2DShape(image->shape(), type, &image_shape);
MACE_FAILURE_RETURN(buffer->Resize(image->shape())); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])}; static_cast<uint32_t>(image_shape[1])};
...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -100,7 +99,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +99,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} }
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) { if (type == CONV2D_FILTER) {
const index_t inner_size = const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
...@@ -22,14 +22,14 @@ namespace kernels { ...@@ -22,14 +22,14 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_FAILURE_RETURN(C->ResizeImage(c_shape, c_image_shape)); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0); const index_t batch = C->dim(0);
const index_t height = C->dim(1); const index_t height = C->dim(1);
...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -71,7 +71,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -71,7 +71,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
Concat("matmul_opencl_kernel", C->dim(0), C->dim(2), C->dim(3));
C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
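For reference, the c_shape computed at the top of the matmul functor keeps A's batch and row count and takes B's column count, with a trailing 1 for the image layout. A small check of that rule (index_t stubbed, dimensions assumed):

#include <cassert>
#include <vector>
using index_t = long long;  // stand-in for MACE's index_t

std::vector<index_t> MatMulOutputShape(const std::vector<index_t> &a,
                                       const std::vector<index_t> &b) {
  return {a[0], a[1], b[2], 1};  // {batch, M, N, 1}, as in c_shape above
}

int main() {
  // A: {batch=1, M=64, K=128, 1}, B: {1, K=128, N=32, 1}
  std::vector<index_t> c =
      MatMulOutputShape({1, 64, 128, 1}, {1, 128, 32, 1});
  assert(c == (std::vector<index_t>{1, 64, 32, 1}));
  return 0;
}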
...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>( kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0; *(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap(); kernel_error->UnMap();
...@@ -70,7 +70,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -70,7 +70,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer()))); *(static_cast<cl::Buffer *>(kernel_error->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
bool is_out_of_range = false; bool is_out_of_range = false;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
is_out_of_range = is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
*(kernel_error->mutable_data<char>()) == 1 ? true : false;
kernel_error->UnMap(); kernel_error->UnMap();
} }
return is_out_of_range; return is_out_of_range;
...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class OutOfRangeCheckTest : public ::testing::Test { class OutOfRangeCheckTest : public ::testing::Test {
protected: protected:
virtual void SetUp() { virtual void SetUp() { setenv("OUT_OF_RANGE_CHECK", "1", 1); }
setenv("OUT_OF_RANGE_CHECK", "1", 1);
}
}; };
TEST(OutOfRangeCheckTest, RandomTest) { TEST(OutOfRangeCheckTest, RandomTest) {
...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels}; std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws; Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer", Tensor *buffer =
GetDeviceAllocator(DeviceType::GPU), ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
......
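The test above exercises the one-byte kernel_error flag that every GPU functor in this commit now allocates via MACE_RETURN_IF_ERROR. Reduced to its steps, the host-device handshake looks like the sketch below, with a plain struct standing in for Buffer and the OpenCL plumbing omitted (in MACE the byte lives on the GPU and Map/UnMap move it across):

#include <cassert>

// Host-memory stand-in for the device error-flag buffer.
struct ErrorFlagBuffer {
  char byte = 0;
  char *Map() { return &byte; }  // analogue of Buffer::Map(nullptr)
  void UnMap() {}                // analogue of Buffer::UnMap()
};

int main() {
  ErrorFlagBuffer kernel_error;

  // 1. Host zeroes the flag before launch (the Map / write 0 / UnMap
  //    sequence repeated throughout this commit).
  *kernel_error.Map() = 0;
  kernel_error.UnMap();

  // 2. A kernel built with -DOUT_OF_RANGE_CHECK writes a nonzero code on
  //    an out-of-bounds access; a clean run is simulated here.

  // 3. Host maps again and fails loudly if the flag was set.
  const char code = *kernel_error.Map();
  kernel_error.UnMap();
  assert(code == 0 && "Kernel error code");  // the MACE_CHECK analogue
  return 0;
}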
...@@ -20,26 +20,25 @@ ...@@ -20,26 +20,25 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *input, Tensor *output,
Tensor *output, StatsFuture *future) {
StatsFuture *future) { MACE_CHECK(this->paddings_.size() ==
MACE_CHECK( static_cast<size_t>((input->dim_size() * 2)));
this->paddings_.size() == static_cast<size_t>((input->dim_size() * 2))); MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
&& (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) << "Mace only support height/width dimension now";
<< "Mace only support height/width dimension now";
auto input_shape = input->shape(); auto input_shape = input->shape();
std::vector<index_t> std::vector<index_t> output_shape = {
output_shape = {input_shape[0] + this->paddings_[0] + this->paddings_[1], input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3], input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5], input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]}; input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(2), output->dim(3));
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct PadFunctor<DeviceType::GPU, float>;
struct PadFunctor<DeviceType::GPU, float>; template struct PadFunctor<DeviceType::GPU, half>;
template
struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
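The NHWC output shape built above grows each axis by its leading and trailing pad, with the batch and channel pads required to be zero by the MACE_CHECK. The arithmetic on its own (index_t stubbed, example numbers assumed):

#include <cassert>
#include <vector>
using index_t = long long;  // stand-in for MACE's index_t

std::vector<index_t> PaddedShape(const std::vector<index_t> &in,
                                 const std::vector<int> &paddings) {
  assert(paddings.size() == in.size() * 2);
  std::vector<index_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = in[i] + paddings[2 * i] + paddings[2 * i + 1];  // lead + trail
  }
  return out;
}

int main() {
  // NHWC {1, 4, 4, 3}, 1-pixel height pads, 2-pixel width pads; the batch
  // pads paddings[0..1] and channel pads paddings[6..7] stay zero.
  std::vector<index_t> out =
      PaddedShape({1, 4, 4, 3}, {0, 0, 1, 1, 2, 2, 0, 0});
  assert(out == (std::vector<index_t>{1, 6, 8, 3}));
  return 0;
}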
...@@ -23,15 +23,13 @@ namespace kernels { ...@@ -23,15 +23,13 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), lws[2] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4; lws[0] = gws[0] / 4;
if (lws[0] == 0) { if (lws[0] == 0) {
...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T> template <typename T>
MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t out_height = output->dim(1); index_t out_height = output->dim(1);
...@@ -125,7 +123,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -125,7 +123,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -23,11 +23,9 @@ namespace mace { ...@@ -23,11 +23,9 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -110,7 +108,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -110,7 +108,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -29,14 +29,15 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -29,14 +29,15 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const size_t outputs_count = output_list.size(); const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count; const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0) MACE_CHECK(output_channels % 4 == 0)
<< "output channels of slice op must be divisible by 4"; << "output channels of slice op must be divisible by 4";
std::vector<index_t> output_shape({input->dim(0), input->dim(1), std::vector<index_t> output_shape(
input->dim(2), output_channels}); {input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE=" +
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blk = RoundUpDiv4(output_channels); const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
...@@ -80,7 +80,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -80,7 +80,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->is_profiling_enabled()) { if (runtime->is_profiling_enabled()) {
CallStats tmp_stats; CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats); runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros =
call_stats.start_micros); std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
} }
} }
...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct SliceFunctor<DeviceType::GPU, float>;
struct SliceFunctor<DeviceType::GPU, float>; template struct SliceFunctor<DeviceType::GPU, half>;
template
struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
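Slice divides the input channels evenly across the outputs and insists each share be a multiple of 4 because the IN_OUT_CHANNEL image format packs four channels per RGBA pixel; channel_blk above is that packed count. RoundUpDiv4 is reimplemented below to match its name (an assumption; the real helper lives in MACE's utils):

#include <cassert>
using index_t = long long;  // stand-in for MACE's index_t

inline index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }

int main() {
  const index_t input_channels = 32;  // assumed
  const index_t outputs_count = 4;    // assumed
  const index_t output_channels = input_channels / outputs_count;  // 8
  assert(output_channels % 4 == 0);           // the MACE_CHECK above
  assert(RoundUpDiv4(output_channels) == 2);  // 2 channel blocks per pixel
  return 0;
}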
...@@ -24,10 +24,8 @@ namespace kernels { ...@@ -24,10 +24,8 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) { uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T> template <typename T>
MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = logits->dim(0); const index_t batch = logits->dim(0);
const index_t height = logits->dim(1); const index_t height = logits->dim(1);
const index_t width = logits->dim(2); const index_t width = logits->dim(2);
...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -88,7 +86,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -88,7 +86,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0), Concat("softmax_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -26,17 +26,13 @@ namespace kernels { ...@@ -26,17 +26,13 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
Tensor *batch_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
if (b2s_) { if (b2s_) {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} }
...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
if (b2s_) { if (b2s_) {
MACE_FAILURE_RETURN(space_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); space_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "batch_to_space"; kernel_name = "batch_to_space";
} else { } else {
MACE_FAILURE_RETURN(batch_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); batch_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "space_to_batch"; kernel_name = "space_to_batch";
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
runtime->BuildKernel("space_to_batch", built_options);
obfuscated_kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -92,7 +87,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -92,7 +87,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -24,7 +24,6 @@ namespace kernels { ...@@ -24,7 +24,6 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -78,12 +77,12 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -78,12 +77,12 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_shape = {16, input_tensor->dim(3), out_width, 1}; output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = std::string tuning_key = Concat("winograd_transform_kernel",
Concat("winograd_transform_kernel", output_tensor->dim(0), output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(2), output_tensor->dim(3));
output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -188,14 +185,14 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -188,14 +185,14 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
input_tensor->dim(1)}; input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
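The leading 16 in the transformed shape above is the flattened 4x4 Winograd input tile; assuming the standard F(2x2, 3x3) formulation (which the commit itself does not name), each tile produces a 2x2 output patch, so round_h and round_w are ceil(dim / 2):

#include <cassert>
#include <cstdint>

inline uint32_t TilesPerAxis(uint32_t dim) { return (dim + 1) / 2; }

int main() {
  assert(TilesPerAxis(8) == 4);  // even extent: tiles cover it exactly
  assert(TilesPerAxis(7) == 4);  // odd extent: the last 2x2 tile overhangs
  return 0;
}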
...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase { ...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase {
MACE_CHECK( MACE_CHECK(
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2); this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape(); auto input_shape = input->shape();
MACE_FAILURE_RETURN(output->Resize({input_shape[0] + this->paddings_[0] MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0]
+ this->paddings_[1], + this->paddings_[1],
input_shape[1] + this->paddings_[2] input_shape[1] + this->paddings_[2]
+ this->paddings_[3], + this->paddings_[3],
......
...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
RoundType::CEIL, RoundType::CEIL,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor); Tensor::MappingGuard output_guard(output_tensor);
......
...@@ -267,7 +267,7 @@ struct ProposalFunctor { ...@@ -267,7 +267,7 @@ struct ProposalFunctor {
// Our RPN implementation only supports a single input image, so all // Our RPN implementation only supports a single input image, so all
// batch inds are 0 // batch inds are 0
size = static_cast<int>(nms_result.size()); size = static_cast<int>(nms_result.size());
MACE_FAILURE_RETURN(output->Resize({size, 1, 1, 5})); MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5}));
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<float>();
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
......
...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor { ...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor {
const index_t num_rois = rois->dim(0); const index_t num_rois = rois->dim(0);
const index_t batch_size = input->dim(0); const index_t batch_size = input->dim(0);
MACE_FAILURE_RETURN(output->Resize({num_rois, pooled_height, pooled_width, MACE_RETURN_IF_ERROR(output->Resize({num_rois, pooled_height, pooled_width,
output_dim_})); output_dim_}));
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
......
...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
index_t out_width = out_width_; index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, channels, out_height, out_width};
MACE_FAILURE_RETURN(output->Resize(out_shape)); MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
......
...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase { ...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
1, 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i= 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape));
output_ptrs[i] = output_list[i]->mutable_data<T>(); output_ptrs[i] = output_list[i]->mutable_data<T>();
} }
const T *input_ptr = input->data<T>(); const T *input_ptr = input->data<T>();
......
...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { ...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(space_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(batch_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
} }
Tensor::MappingGuard input_guard(space_tensor); Tensor::MappingGuard input_guard(space_tensor);
......
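The kernel hunks above are a mechanical rename of MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR around calls whose return type is MaceStatus — chiefly Tensor::Resize, which allocates and can therefore fail. The macro's definition is not part of this diff; a plausible reconstruction of such an early-return helper (hypothetical — the real one may also log the failing statement):

    // Hypothetical sketch only; the real macro lives in MACE's headers.
    // Pattern: evaluate the expression, and if it did not succeed,
    // immediately return its status to the caller.
    #define MACE_RETURN_IF_ERROR(stmt)              \
      do {                                          \
        MaceStatus status_ = (stmt);                \
        if (status_ != MaceStatus::MACE_SUCCESS) {  \
          return status_;                           \
        }                                           \
      } while (0)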
...@@ -15,7 +15,6 @@ cc_library( ...@@ -15,7 +15,6 @@ cc_library(
hdrs = [ hdrs = [
"ops_test_util.h", "ops_test_util.h",
], ],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [ deps = [
"//mace/core", "//mace/core",
"@gtest//:gtest", "@gtest//:gtest",
...@@ -36,18 +35,23 @@ cc_library( ...@@ -36,18 +35,23 @@ cc_library(
[ [
"buffer_to_image.cc", "buffer_to_image.cc",
"image_to_buffer.cc", "image_to_buffer.cc",
]), ],
),
hdrs = glob( hdrs = glob(
["*.h"], ["*.h"],
exclude = ["ops_test_util.h"], exclude = ["ops_test_util.h"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [ deps = [
"//mace/kernels", "//mace/kernels",
], ],
...@@ -60,13 +64,17 @@ cc_test( ...@@ -60,13 +64,17 @@ cc_test(
srcs = glob( srcs = glob(
["*_test.cc"], ["*_test.cc"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
...@@ -80,13 +88,17 @@ cc_test( ...@@ -80,13 +88,17 @@ cc_test(
name = "ops_benchmark", name = "ops_benchmark",
testonly = 1, testonly = 1,
srcs = glob(["*_benchmark.cc"]), srcs = glob(["*_benchmark.cc"]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
......
...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> { ...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> {
functor_(kernels::StringToActivationType( functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
static_cast<T>(OperatorBase::GetOptionalArg<float>( static_cast<T>(
"max_limit", 0.0f))) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr; this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->Output(0); Tensor *output_tensor = this->Output(0);
MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensor)); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor));
return functor_(input_tensor, alpha_tensor, output_tensor, future); return functor_(input_tensor, alpha_tensor, output_tensor, future);
} }
......
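ActivationOp::Run above shows the convention end to end: resize the output first, bail out if that fails, then return the functor's own status. A self-contained sketch of why Resize can fail and how the status travels up — Status, Buffer and Tensor here are placeholders, not MACE's real classes:

    #include <cstddef>
    #include <new>
    #include <vector>

    enum class Status { kSuccess, kOutOfResources };

    #define RETURN_IF_ERROR(stmt)               \
      do {                                      \
        Status s_ = (stmt);                     \
        if (s_ != Status::kSuccess) return s_;  \
      } while (0)

    struct Buffer {
      std::vector<char> data;
      Status Allocate(std::size_t nbytes) {
        try {
          data.resize(nbytes);             // may throw std::bad_alloc
        } catch (const std::bad_alloc &) {
          return Status::kOutOfResources;  // surface failure as a status
        }
        return Status::kSuccess;
      }
    };

    struct Tensor {
      Buffer buffer;
      Status Resize(const std::vector<std::size_t> &shape) {
        std::size_t bytes = sizeof(float);
        for (std::size_t d : shape) bytes *= d;
        RETURN_IF_ERROR(buffer.Allocate(bytes));  // propagate, don't swallow
        return Status::kSuccess;
      }
    };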
...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { ...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::GPU>(); TestUnalignedSimpleRelu<DeviceType::GPU>();
} }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void TestSimpleRelux() { void TestSimpleRelux() {
...@@ -169,9 +168,7 @@ void TestSimpleRelux() { ...@@ -169,9 +168,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) { TEST_F(ActivationOpTest, OPENCLSimple) { TestSimpleRelux<DeviceType::GPU>(); }
TestSimpleRelux<DeviceType::GPU>();
}
namespace { namespace {
template <DeviceType D> template <DeviceType D>
...@@ -278,9 +275,7 @@ void TestSimplePrelu() { ...@@ -278,9 +275,7 @@ void TestSimplePrelu() {
} }
} // namespace } // namespace
TEST_F(ActivationOpTest, CPUSimplePrelu) { TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::GPU>(); TestSimplePrelu<DeviceType::GPU>();
......
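For reference, the activations these tests exercise follow the standard definitions, with the op's "max_limit" argument capping ReLUX and the optional second input supplying PReLU's slope:

    \mathrm{ReLU}(x)  = \max(x, 0)
    \mathrm{ReLUX}(x) = \min(\max(x, 0), x_{\max})
    \mathrm{PReLU}(x) = \begin{cases} x, & x > 0 \\ \alpha x, & x \le 0 \end{cases}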
...@@ -97,8 +97,8 @@ void SimpleAdd3() { ...@@ -97,8 +97,8 @@ void SimpleAdd3() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 2, 3, 1}, auto expected =
{-0.000713, 8, 12, 16, 20, 24}); CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
} }
...@@ -160,8 +160,8 @@ void RandomTest() { ...@@ -160,8 +160,8 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-2); 1e-2);
} }
} }
} // namespace } // namespace
......
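ExpectTensorNear is called above with two trailing tolerances (e.g. 1e-2, 1e-2). A common reading of such a pair is a relative bound plus an absolute floor; whether MACE's helper uses exactly this formula is an assumption, but a minimal sketch of a two-tolerance comparison looks like:

    #include <cmath>

    // Hedged sketch: combined relative/absolute closeness check.
    // MACE's ExpectTensorNear may differ in detail.
    bool Near(float actual, float expected, float rel_tol, float abs_tol) {
      return std::fabs(actual - expected) <=
             abs_tol + rel_tol * std::fabs(expected);
    }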
...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> { ...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> {
var->dim_size()); var->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future); return functor_(input, scale, offset, mean, var, epsilon_, output, future);
} }
......
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class BatchNormOpTest : public OpsTestBase {}; class BatchNormOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -37,14 +37,14 @@ void Simple() { ...@@ -37,14 +37,14 @@ void Simple() {
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -62,14 +62,14 @@ void Simple() { ...@@ -62,14 +62,14 @@ void Simple() {
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -79,10 +79,9 @@ void Simple() { ...@@ -79,10 +79,9 @@ void Simple() {
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
0.8291, 0.8291, 3.1708, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
} }
...@@ -103,35 +102,31 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,35 +102,31 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -140,25 +135,25 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -140,25 +135,25 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -170,7 +165,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -170,7 +165,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -186,34 +181,30 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -186,34 +181,30 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -222,26 +213,26 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -222,26 +213,26 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -253,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -253,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
...@@ -269,34 +260,30 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,34 +260,30 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -305,25 +292,25 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -305,25 +292,25 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -335,7 +322,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -335,7 +322,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -351,34 +338,30 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -351,34 +338,30 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -387,26 +370,26 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -387,26 +370,26 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -418,7 +401,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -418,7 +401,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
......
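All of the BatchNorm tests above feed Scale, Offset, Mean, Var and an epsilon into the same computation; the CPU reference the GPU run is compared against is, per the standard definition of inference-time batch normalization,

    y = \gamma \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta

with gamma = Scale, beta = Offset, mu = Mean and sigma^2 = Var. Note the half-precision variants raise epsilon from 1e-3 to 1e-1 and correspondingly widen the comparison tolerances to 1e-1 / 1e-2.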
...@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT); const Tensor *batch_tensor = this->Input(INPUT);
Tensor *space_tensor = this->Output(OUTPUT); Tensor *space_tensor = this->Output(OUTPUT);
return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future);
future);
} }
private: private:
......
...@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> { ...@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> {
bias->dim_size()); bias->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, bias, output, future); return functor_(input, bias, output, future);
} }
......
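BiasAdd broadcasts a per-channel vector over the batch and spatial dimensions; in NHWC layout,

    y_{n,h,w,c} = x_{n,h,w,c} + b_c

and the NCHW CPU path moves the channel index but computes the same thing, which is why the tests below transform formats before and after running.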
...@@ -32,21 +32,17 @@ void BiasAddSimple() { ...@@ -32,21 +32,17 @@ void BiasAddSimple() {
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -79,9 +75,7 @@ void BiasAddSimple() { ...@@ -79,9 +75,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); } TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { BiasAddSimple<DeviceType::GPU>(); }
BiasAddSimple<DeviceType::GPU>();
}
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -124,9 +114,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -124,9 +114,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("InputImage")
...@@ -139,7 +129,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -139,7 +129,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
...@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -183,9 +169,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -183,9 +169,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("InputImage")
...@@ -198,7 +184,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -198,7 +184,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
......
...@@ -71,27 +71,27 @@ TEST(BufferToImageTest, ArgLarge) { ...@@ -71,27 +71,27 @@ TEST(BufferToImageTest, ArgLarge) {
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1}); {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3}); {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3}); {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedium) { TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128}); {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256}); {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
...@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::GPU, half>( TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
kernels::ARGUMENT, {2}, input_data); {2}, input_data);
} }
} // namespace test } // namespace test
......
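The ArgStringHalfToHalfSmall test above builds its input from raw bytes. Read as little-endian IEEE 754 half-precision, {0xCD, 0x3C} is 0x3CCD ≈ 1.2 and {0x33, 0x40} is 0x4033 ≈ 2.1. A self-contained decoder sketch for the normal-number case (enough for these two values; subnormals, infinities and NaNs omitted):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Decode an IEEE 754 binary16 value (normal numbers only).
    float HalfToFloat(uint16_t h) {
      int sign = (h >> 15) & 0x1;
      int exponent = (h >> 10) & 0x1F;  // 5-bit exponent, bias 15
      int mantissa = h & 0x3FF;         // 10-bit fraction
      float value =
          (1.0f + mantissa / 1024.0f) * std::ldexp(1.0f, exponent - 15);
      return sign ? -value : value;
    }

    int main() {
      // The two halves from the test's byte array, little-endian.
      std::printf("%f %f\n", HalfToFloat(0x3CCD), HalfToFloat(0x4033));
      // prints 1.200195 2.099609
    }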
...@@ -29,23 +29,19 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { ...@@ -29,23 +29,19 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input", {1, 1, 2, 8}, "Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("group", 4) .AddIntArg("group", 4)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -65,7 +61,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -65,7 +61,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage") .Input("InputImage")
...@@ -78,7 +74,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -78,7 +74,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
// Transfer output // Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
......
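Channel shuffle with G groups views the C channels as a G x (C/G) matrix and transposes it. A sketch of the per-channel permutation — assumed to match the op's semantics; the test's expected tensor is what actually pins the direction down:

    #include <vector>

    // View C channels as [groups, C/groups], transpose, flatten.
    std::vector<int> ShuffleOrder(int channels, int groups) {
      const int channels_per_group = channels / groups;
      std::vector<int> order(channels);
      for (int c = 0; c < channels; ++c) {
        order[c] = (c % groups) * channels_per_group + c / groups;
      }
      return order;
    }
    // For channels = 8, groups = 4 (the C8G4 test): 0 2 4 6 1 3 5 7.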
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <functional> #include <functional>
#include <string>
#include "gmock/gmock.h" #include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h" #include "mace/ops/concat.h"
#include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
int concat_axis_size = 0; int concat_axis_size = 0;
// Construct graph // Construct graph
std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>()); std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
std::vector<const float*> input_ptrs(num_inputs); std::vector<const float *> input_ptrs(num_inputs);
OpsTestNet net; OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i); const std::string input_name = MakeString("Input", i);
...@@ -171,10 +171,10 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -171,10 +171,10 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
shapes[i], inputs[i]); inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name, BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
...@@ -191,7 +191,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -191,7 +191,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto output = net.GetOutput("Output"); auto output = net.GetOutput("Output");
......
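The loop above accumulates concat_axis_size += shapes[i][axis]: concatenation leaves every other dimension unchanged and sums extents along the chosen axis. A small shape-only sketch of that rule:

    #include <vector>

    // Output shape of Concat along `axis`: inputs must agree on every
    // other dimension; the concat axis is the sum of their extents.
    std::vector<long long> ConcatShape(
        const std::vector<std::vector<long long>> &shapes, int axis) {
      std::vector<long long> out = shapes[0];
      out[axis] = 0;
      for (const auto &s : shapes) out[axis] += s[axis];
      return out;
    }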
...@@ -25,40 +25,36 @@ namespace test { ...@@ -25,40 +25,36 @@ namespace test {
class Conv2dOpTest : public OpsTestBase {}; class Conv2dOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3VALID() { void TestNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -67,15 +63,15 @@ void TestNHWCSimple3x3VALID() { ...@@ -67,15 +63,15 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
...@@ -91,41 +87,37 @@ void TestNHWCSimple3x3VALID() { ...@@ -91,41 +87,37 @@ void TestNHWCSimple3x3VALID() {
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3SAME() { void TestNHWCSimple3x3SAME() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -134,15 +126,15 @@ void TestNHWCSimple3x3SAME() { ...@@ -134,15 +126,15 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -155,8 +147,8 @@ void TestNHWCSimple3x3SAME() { ...@@ -155,8 +147,8 @@ void TestNHWCSimple3x3SAME() {
} }
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -173,40 +165,36 @@ TEST_F(Conv2dOpTest, OPENCLSimple) { ...@@ -173,40 +165,36 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -214,14 +202,14 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -214,14 +202,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
...@@ -247,45 +235,40 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) { ...@@ -247,45 +235,40 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCCombined3x3() { void TestNHWCCombined3x3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, T>(
"Filter", {2, 2, 3, 3}, "Filter", {2, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -295,15 +278,15 @@ void TestNHWCCombined3x3() { ...@@ -295,15 +278,15 @@ void TestNHWCCombined3x3() {
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -315,8 +298,8 @@ void TestNHWCCombined3x3() { ...@@ -315,8 +298,8 @@ void TestNHWCCombined3x3() {
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
...@@ -330,14 +313,13 @@ TEST_F(Conv2dOpTest, OPENCLStride2) { ...@@ -330,14 +313,13 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestFusedNHWCSimple3x3VALID() { void TestFusedNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
-1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
...@@ -345,9 +327,7 @@ void TestFusedNHWCSimple3x3VALID() { ...@@ -345,9 +327,7 @@ void TestFusedNHWCSimple3x3VALID() {
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f}); net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -362,10 +342,8 @@ void TestFusedNHWCSimple3x3VALID() { ...@@ -362,10 +342,8 @@ void TestFusedNHWCSimple3x3VALID() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -398,25 +376,21 @@ void TestFusedNHWCSimple3x3VALID() { ...@@ -398,25 +376,21 @@ void TestFusedNHWCSimple3x3VALID() {
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestFusedNHWCSimple3x3WithoutBias() { void TestFusedNHWCSimple3x3WithoutBias() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
-1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -431,10 +405,8 @@ void TestFusedNHWCSimple3x3WithoutBias() { ...@@ -431,10 +405,8 @@ void TestFusedNHWCSimple3x3WithoutBias() {
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -478,48 +450,43 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) { ...@@ -478,48 +450,43 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>(); TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
} }
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void TestConv1x1() { void TestConv1x1() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5}, "Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {2, 5, 1, 1}, "Filter", {2, 5, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -529,14 +496,14 @@ void TestConv1x1() { ...@@ -529,14 +496,14 @@ void TestConv1x1() {
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -548,13 +515,13 @@ void TestConv1x1() { ...@@ -548,13 +515,13 @@ void TestConv1x1() {
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 3, 10, 2}, {1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -565,7 +532,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); } ...@@ -565,7 +532,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); } TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
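The expected values in TestConv1x1 follow directly from the inputs above: a 1x1 convolution is a per-pixel dot product over the input channels, and all five input channels are 1. With the first filter's weights all 1.0 (bias 0.1) and the second filter's weights all 2.0 (bias 0.2):

out_channel0 = 5 * 1.0 + 0.1 = 5.1
out_channel1 = 5 * 2.0 + 0.2 = 10.2

which is why the expected tensor is the repeating {5.1, 10.2} pair at every one of the 3 * 10 output positions.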
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestComplexConvNxNS12(const std::vector<index_t> &shape, void TestComplexConvNxNS12(const std::vector<index_t> &shape,
const int stride) { const int stride) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
...@@ -584,33 +551,28 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -584,33 +551,28 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>( net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -625,22 +587,22 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -625,22 +587,22 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
*net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); 1e-4);
}; };
for (int kernel_size : {1, 3, 5, 7}) { for (int kernel_size : {1, 3, 5, 7}) {
...@@ -666,7 +628,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { ...@@ -666,7 +628,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
} }
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape, const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) { const std::vector<int> &dilations) {
...@@ -690,40 +652,36 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -690,40 +652,36 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
&float_input_data); &float_input_data);
std::vector<float> float_filter_data; std::vector<float> float_filter_data;
GenerateRandomRealTypeData( GenerateRandomRealTypeData(
{kernel_h, kernel_w, output_channels, input_channels}, {kernel_h, kernel_w, output_channels, input_channels},
&float_filter_data); &float_filter_data);
std::vector<float> float_bias_data; std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data); GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data); "Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, "Filter", {output_channels, input_channels, kernel_h, kernel_w},
float_filter_data); float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data); net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]}) .AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -738,23 +696,23 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -738,23 +696,23 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]}) .AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
*net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); 1e-1);
}; };
func(1, 1, VALID); func(1, 1, VALID);
...@@ -767,20 +725,16 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -767,20 +725,16 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64}, {1, 1});
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64}, {1, 1});
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 64}, {1, 1});
{1, 1}); TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 63}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 63},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x7S1) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x7S1) {
...@@ -800,55 +754,45 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x1S1) { ...@@ -800,55 +754,45 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x1S1) {
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64}, {1, 1});
{1, 1}); TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 63}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 63},
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2}, {1, 1});
{1, 1}); TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {15, 1, 64, 2}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {15, 1, 64, 2},
{1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {15, 1, 32, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {15, 1, 32, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2}, {1, 1});
{1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {1, 15, 32, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {1, 15, 32, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7}, {1, 1});
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7}, {1, 1});
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16}, {2, 2});
{2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16}, {2, 2});
{2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16}, {4, 4});
{4, 4});
} }
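The half-precision variants above pass ExpectTensorNear looser bounds (1e-2, 1e-1) than the float tests (1e-4, 1e-4). That gap is consistent with fp16 arithmetic: a 10-bit mantissa gives a machine epsilon of

2^-10 = 1/1024 ≈ 9.8e-4,

and rounding error accumulates over the kernel_h * kernel_w * input_channels multiply-adds behind each output element, so a fixed 1e-4 bound would flake on the larger kernels.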
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestDilationConvNxN(const std::vector<index_t> &shape, void TestDilationConvNxN(const std::vector<index_t> &shape,
const int dilation_rate) { const int dilation_rate) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
...@@ -868,33 +812,28 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -868,33 +812,28 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>( net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate}) .AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -909,22 +848,22 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -909,22 +848,22 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate}) .AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4, 1e-4); 1e-4);
}; };
for (int kernel_size : {3}) { for (int kernel_size : {3}) {
...@@ -949,7 +888,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { ...@@ -949,7 +888,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
} }
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape, const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) { const std::vector<int> &dilations) {
...@@ -975,9 +914,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -975,9 +914,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -993,10 +930,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -993,10 +930,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
...@@ -1024,8 +959,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -1024,8 +959,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-1); 1e-1);
}; };
func(1, 1, VALID); func(1, 1, VALID);
...@@ -1034,17 +969,16 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -1034,17 +969,16 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) { TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3}, TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3}, {2, 2});
{2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) { TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16}, TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
{2, 2}); {2, 2});
} }
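Atrous convolution is dilated convolution under another name: a kernel of size k with dilation rate d samples its taps d pixels apart, covering an effective extent of

k_eff = k + (k - 1) * (d - 1),

so the 15x15 kernel at dilation 2 above spans 29 pixels, which is why that test feeds it an input (63x71) comfortably larger than the effective kernel.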
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) { const std::vector<int> &paddings) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
...@@ -1063,31 +997,27 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1063,31 +997,27 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>( net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings) .AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -1102,21 +1032,21 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1102,21 +1032,21 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings) .AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4, 1e-4); 1e-4);
}; };
for (int kernel_size : {3, 5, 7}) { for (int kernel_size : {3, 5, 7}) {
...@@ -1132,8 +1062,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad1) { ...@@ -1132,8 +1062,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
} }
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, {2, 2});
{2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
......
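A recurring pattern in the conv2d tests above: inputs are authored in NHWC, but the CPU reference path computes in NCHW, hence the TransformDataFormat calls bracketing every CPU run. The real helper lives in the ops test framework; the following standalone sketch (illustrative names, not MACE API) shows the index mapping it performs:

#include <vector>

// Illustrative only: repack an NHWC float buffer into NCHW order.
std::vector<float> NHWCToNCHW(const std::vector<float> &src,
                              int n, int h, int w, int c) {
  std::vector<float> dst(src.size());
  for (int b = 0; b < n; ++b)
    for (int ch = 0; ch < c; ++ch)
      for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
          dst[((b * c + ch) * h + y) * w + x] =      // NCHW offset
              src[((b * h + y) * w + x) * c + ch];   // NHWC offset
  return dst;
}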
...@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> { ...@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))), "padding", static_cast<int>(SAME)))),
paddings_(OperatorBase::GetRepeatedArgs<int>("padding_values")), paddings_(OperatorBase::GetRepeatedArgs<int>("padding_values")),
dilations_( dilations_(OperatorBase::GetRepeatedArgs<int>("dilations", {1, 1})) {}
OperatorBase::GetRepeatedArgs<int>("dilations", {1, 1})) {}
protected: protected:
std::vector<int> strides_; std::vector<int> strides_;
......
...@@ -31,9 +31,8 @@ TEST(CoreTest, INIT_MODE) { ...@@ -31,9 +31,8 @@ TEST(CoreTest, INIT_MODE) {
.AddIntArg("mode", static_cast<int>(NetMode::INIT)) .AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<float>::v());
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
......
...@@ -25,7 +25,7 @@ namespace test { ...@@ -25,7 +25,7 @@ namespace test {
class Deconv2dOpTest : public OpsTestBase {}; class Deconv2dOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void RunTestSimple(const std::vector<index_t> &input_shape, void RunTestSimple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data, const std::vector<float> &input_data,
const int stride, const int stride,
...@@ -40,10 +40,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -40,10 +40,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data); net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.TransformDataFormat<D, float>("Filter", net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
HWOI,
"FilterOIHW",
OIHW);
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
...@@ -66,9 +63,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -66,9 +63,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -81,317 +76,165 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -81,317 +76,165 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001);
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple3x3SAME_S1() { void TestNHWCSimple3x3SAME_S1() {
RunTestSimple<D>({1, 3, 3, 1}, RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, Padding::SAME,
{1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
1, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Padding::SAME, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{0, 0}, {1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
{1, 3, 3, 3}, 9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4});
{3, 3, 3, 1}, RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, Padding::VALID,
{1, 1, 1, 1, 1, 1, 1, 1, 1, {2, 2}, {0}, {3, 3, 3, 1},
1, 1, 1, 1, 1, 1, 1, 1, 1, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 3, 3, 3}, {1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
{4, 4, 4, 6, 6, 6, 4, 4, 4, 9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4});
6, 6, 6, 9, 9, 9, 6, 6, 6, RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::SAME,
4, 4, 4, 6, 6, 6, 4, 4, 4}); {0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
{1, 1, 1, 1, 1, 1, 1, 1, 1}, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
1, {1, 3, 3, 3}, {54, 66, 78, 126, 147, 168, 130, 146, 162,
Padding::VALID, 198, 225, 252, 405, 450, 495, 366, 399, 432,
{2, 2}, 354, 378, 402, 630, 669, 708, 502, 530, 558});
{0}, RunTestSimple<D>(
{3, 3, 3, 1}, {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::SAME, {2, 2}, {0},
{1, 1, 1, 1, 1, 1, 1, 1, 1, {3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1, 1, 1, 1, 1, 1, 1, 1, 1, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
1, 1, 1, 1, 1, 1, 1, 1, 1}, {1, 3, 3, 3},
{1, 3, 3, 3}, {54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450,
{4, 4, 4, 6, 6, 6, 4, 4, 4, 495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 558});
6, 6, 6, 9, 9, 9, 6, 6, 6,
4, 4, 4, 6, 6, 6, 4, 4, 4});
RunTestSimple<D>({1, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9},
1,
Padding::SAME,
{0, 0},
{1, 3, 3, 3},
{3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 3, 3, 3},
{54, 66, 78, 126, 147, 168, 130, 146, 162,
198, 225, 252, 405, 450, 495, 366, 399, 432,
354, 378, 402, 630, 669, 708, 502, 530, 558});
RunTestSimple<D>({1, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9},
1,
Padding::SAME,
{2, 2},
{0},
{3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 3, 3, 3},
{54, 66, 78, 126, 147, 168, 130, 146, 162,
198, 225, 252, 405, 450, 495, 366, 399, 432,
354, 378, 402, 630, 669, 708, 502, 530, 558});
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple3x3SAME_S2() { void TestNHWCSimple3x3SAME_S2() {
RunTestSimple<D>({1, 3, 3, 1}, RunTestSimple<D>(
{1, 1, 1, 1, 1, 1, 1, 1, 1}, {1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {0, 0},
2, {1, 6, 6, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Padding::SAME, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{0, 0}, {1, 6, 6, 3},
{1, 6, 6, 3}, {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
{3, 3, 3, 1}, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4,
{1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1});
{1, 6, 6, 3}, RunTestSimple<D>(
{1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, {1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {2, 2}, {0},
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, {1, 5, 5, 3}, {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}); 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4,
RunTestSimple<D>({1, 3, 3, 1}, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1});
{1, 1, 1, 1, 1, 1, 1, 1, 1}, RunTestSimple<D>(
2, {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {0, 0},
Padding::SAME, {1, 6, 6, 3}, {3, 3, 3, 1},
{2, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
{0}, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{3, 3, 3, 1}, {1, 6, 6, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, {1, 2, 3, 4, 5, 6, 9, 12, 15, 8, 10, 12, 17, 22,
1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 12, 15, 18, 10, 11, 12, 13, 14, 15, 36, 39, 42, 26,
1, 1, 1, 1, 1, 1, 1, 1, 1}, 28, 30, 62, 67, 72, 39, 42, 45, 23, 28, 33, 38, 43, 48,
{1, 5, 5, 3}, 96, 108, 120, 64, 71, 78, 148, 164, 180, 90, 99, 108, 40, 44,
{1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 48, 52, 56, 60, 114, 123, 132, 65, 70, 75, 140, 151, 162, 78,
2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 84, 90, 83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168,
1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192, 207,
2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135});
1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}); RunTestSimple<D>(
RunTestSimple<D>({1, 3, 3, 1}, {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {2, 2}, {0},
{1, 2, 3, 4, 5, 6, 7, 8, 9}, {3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
Padding::SAME, {1, 5, 5, 3},
{0, 0}, {13, 14, 15, 36, 39, 42, 26, 28, 30, 62, 67, 72, 39,
{1, 6, 6, 3}, 42, 45, 38, 43, 48, 96, 108, 120, 64, 71, 78, 148, 164,
{3, 3, 3, 1}, 180, 90, 99, 108, 52, 56, 60, 114, 123, 132, 65, 70, 75,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300, 142,
10, 11, 12, 13, 14, 15, 16, 17, 18, 155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105, 192, 207,
19, 20, 21, 22, 23, 24, 25, 26, 27}, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135});
{1, 6, 6, 3},
{1, 2, 3, 4, 5, 6, 9, 12, 15,
8, 10, 12, 17, 22, 27, 12, 15, 18,
10, 11, 12, 13, 14, 15, 36, 39, 42,
26, 28, 30, 62, 67, 72, 39, 42, 45,
23, 28, 33, 38, 43, 48, 96, 108, 120,
64, 71, 78, 148, 164, 180, 90, 99, 108,
40, 44, 48, 52, 56, 60, 114, 123, 132,
65, 70, 75, 140, 151, 162, 78, 84, 90,
83, 94, 105, 116, 127, 138, 252, 276, 300,
142, 155, 168, 304, 332, 360, 168, 183, 198,
70, 77, 84, 91, 98, 105, 192, 207, 222,
104, 112, 120, 218, 235, 252, 117, 126, 135});
RunTestSimple<D>({1, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9},
2,
Padding::SAME,
{2, 2},
{0},
{3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3},
{13, 14, 15, 36, 39, 42,
26, 28, 30, 62, 67, 72, 39, 42, 45,
38, 43, 48, 96, 108, 120,
64, 71, 78, 148, 164, 180, 90, 99, 108,
52, 56, 60, 114, 123, 132,
65, 70, 75, 140, 151, 162, 78, 84, 90,
116, 127, 138, 252, 276, 300,
142, 155, 168, 304, 332, 360, 168, 183, 198,
91, 98, 105, 192, 207, 222,
104, 112, 120, 218, 235, 252, 117, 126, 135});
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple3x3SAME_S2_1() { void TestNHWCSimple3x3SAME_S2_1() {
RunTestSimple<D>({1, 3, 3, 1}, RunTestSimple<D>(
{12, 18, 12, 18, 27, 18, 12, 18, 12}, {1, 3, 3, 1}, {12, 18, 12, 18, 27, 18, 12, 18, 12}, 2, Padding::SAME,
2, {0, 0}, {1, 5, 5, 3}, {3, 3, 3, 1},
Padding::SAME, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
{0, 0}, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 5, 5, 3}, {1, 5, 5, 3},
{3, 3, 3, 1}, {12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12,
{1, 1, 1, 1, 1, 1, 1, 1, 1, 30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18,
1, 1, 1, 1, 1, 1, 1, 1, 1}, 30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
{1, 5, 5, 3}, 12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12});
{12, 12, 12, 30, 30, 30, 18, 18, 18,
30, 30, 30, 12, 12, 12,
30, 30, 30, 75, 75, 75, 45, 45, 45,
75, 75, 75, 30, 30, 30,
18, 18, 18, 45, 45, 45, 27, 27, 27,
45, 45, 45, 18, 18, 18,
30, 30, 30, 75, 75, 75, 45, 45, 45,
75, 75, 75, 30, 30, 30,
12, 12, 12, 30, 30, 30, 18, 18, 18,
30, 30, 30, 12, 12, 12});
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple3x3VALID_S2() { void TestNHWCSimple3x3VALID_S2() {
RunTestSimple<D>({1, 3, 3, 1}, RunTestSimple<D>(
{1, 1, 1, 1, 1, 1, 1, 1, 1}, {1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::VALID, {0, 0},
2, {1, 7, 7, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Padding::VALID, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{0, 0}, {1, 7, 7, 3},
{1, 7, 7, 3}, {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
{3, 3, 3, 1}, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2,
{1, 7, 7, 3}, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
{1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1});
1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2,
4, 4, 4, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2,
4, 4, 4, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 1, 1, 1, 1, 1, 1});
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple3x3VALID_S1() { void TestNHWCSimple3x3VALID_S1() {
RunTestSimple<D>({1, 3, 3, 1}, RunTestSimple<D>(
{1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {0, 0},
1, {1, 5, 5, 3}, {3, 3, 3, 1},
Padding::VALID, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
{0, 0}, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3}, {1, 5, 5, 3},
{3, 3, 3, 1}, {1, 2, 3, 6, 9, 12, 18, 24, 30, 26, 31, 36, 21,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 24, 27, 14, 19, 24, 54, 66, 78, 126, 147, 168, 130, 146,
10, 11, 12, 13, 14, 15, 16, 17, 18, 162, 90, 99, 108, 66, 78, 90, 198, 225, 252, 405, 450, 495,
19, 20, 21, 22, 23, 24, 25, 26, 27}, 366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630,
{1, 5, 5, 3}, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321,
{1, 2, 3, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243});
6, 9, 12, RunTestSimple<D>(
18, 24, 30, {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {4, 4}, {0},
26, 31, 36, {3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
21, 24, 27, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
14, 19, 24, {1, 5, 5, 3},
54, 66, 78, {1, 2, 3, 6, 9, 12, 18, 24, 30, 26, 31, 36, 21,
126, 147, 168, 24, 27, 14, 19, 24, 54, 66, 78, 126, 147, 168, 130, 146,
130, 146, 162, 162, 90, 99, 108, 66, 78, 90, 198, 225, 252, 405, 450, 495,
90, 99, 108, 366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630,
66, 78, 90, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321,
198, 225, 252, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243});
405, 450, 495,
366, 399, 432,
234, 252, 270,
146, 157, 168,
354, 378, 402,
630, 669, 708,
502, 530, 558,
294, 309, 324,
133, 140, 147,
306, 321, 336,
522, 546, 570,
398, 415, 432,
225, 234, 243});
RunTestSimple<D>({1, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9},
1,
Padding::VALID,
{4, 4},
{0},
{3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3},
{1, 2, 3,
6, 9, 12,
18, 24, 30,
26, 31, 36,
21, 24, 27,
14, 19, 24,
54, 66, 78,
126, 147, 168,
130, 146, 162,
90, 99, 108,
66, 78, 90,
198, 225, 252,
405, 450, 495,
366, 399, 432,
234, 252, 270,
146, 157, 168,
354, 378, 402,
630, 669, 708,
502, 530, 558,
294, 309, 324,
133, 140, 147,
306, 321, 336,
522, 546, 570,
398, 415, 432,
225, 234, 243});
} }
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple2x2SAME() { void TestNHWCSimple2x2SAME() {
RunTestSimple<D>({1, 2, 2, 1}, RunTestSimple<D>({1, 2, 2, 1}, {1, 1, 1, 1}, 1, Padding::SAME, {0, 0},
{1, 1, 1, 1}, {1, 2, 2, 1}, {3, 3, 1, 1},
1,
Padding::SAME,
{0, 0},
{1, 2, 2, 1},
{3, 3, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 2, 2, 1}, {1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f});
{4.f, 4.f, 4.f, 4.f});
} }
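The 2x2 SAME case is small enough to check by hand. A transposed convolution scatters a copy of the 3x3 all-ones kernel at each input position (stride 1), so the full, uncropped 4x4 output is

[1 2 2 1]
[2 4 4 2]
[2 4 4 2]
[1 2 2 1]

and SAME padding keeps the centered 2x2 window, giving the expected {4, 4, 4, 4}: every retained output pixel receives a contribution from all four input ones.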
template<DeviceType D> template <DeviceType D>
void TestNHWCSimple2x2VALID() { void TestNHWCSimple2x2VALID() {
RunTestSimple<D>({1, 2, 2, 1}, RunTestSimple<D>(
{1, 1, 1, 1}, {1, 2, 2, 1}, {1, 1, 1, 1}, 2, Padding::VALID, {0, 0}, {1, 5, 5, 1},
2, {3, 3, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
Padding::VALID, {1, 5, 5, 1},
{0, 0}, {1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f,
{1, 5, 5, 1}, 2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f});
{3, 3, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 5, 5, 1},
{1.f, 1.f, 2.f, 1.f, 1.f,
1.f, 1.f, 2.f, 1.f, 1.f,
2.f, 2.f, 4.f, 2.f, 2.f,
1.f, 1.f, 2.f, 1.f, 1.f,
1.f, 1.f, 2.f, 1.f, 1.f});
} }
} // namespace } // namespace
...@@ -400,11 +243,11 @@ TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S1) { ...@@ -400,11 +243,11 @@ TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S1) {
} }
TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2) { TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2) {
TestNHWCSimple3x3SAME_S2<DeviceType::CPU>(); TestNHWCSimple3x3SAME_S2<DeviceType::CPU>();
} }
TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2_1) { TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2_1) {
TestNHWCSimple3x3SAME_S2_1<DeviceType::CPU>(); TestNHWCSimple3x3SAME_S2_1<DeviceType::CPU>();
} }
TEST_F(Deconv2dOpTest, CPUSimple2X2PaddingSame) { TEST_F(Deconv2dOpTest, CPUSimple2X2PaddingSame) {
...@@ -432,11 +275,11 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S1) { ...@@ -432,11 +275,11 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S1) {
} }
TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2) { TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2) {
TestNHWCSimple3x3SAME_S2<DeviceType::GPU>(); TestNHWCSimple3x3SAME_S2<DeviceType::GPU>();
} }
TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2_1) { TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2_1) {
TestNHWCSimple3x3SAME_S2_1<DeviceType::GPU>(); TestNHWCSimple3x3SAME_S2_1<DeviceType::GPU>();
} }
TEST_F(Deconv2dOpTest, OPENCLSimple2X2PaddingValid) { TEST_F(Deconv2dOpTest, OPENCLSimple2X2PaddingValid) {
...@@ -452,7 +295,7 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) { ...@@ -452,7 +295,7 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestComplexDeconvNxNS12(const int batch, void TestComplexDeconvNxNS12(const int batch,
const std::vector<int> &shape, const std::vector<int> &shape,
const int stride) { const int stride) {
...@@ -473,14 +316,12 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -473,14 +316,12 @@ void TestComplexDeconvNxNS12(const int batch,
net.AddRandomInput<D, T>( net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
int out_h = 0; int out_h = 0;
int out_w = 0; int out_w = 0;
std::vector<int>paddings; std::vector<int> paddings;
std::vector<int> output_shape; std::vector<int> output_shape;
if (padding < 0) { if (padding < 0) {
...@@ -496,8 +337,8 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -496,8 +337,8 @@ void TestComplexDeconvNxNS12(const int batch,
output_shape.push_back(out_w); output_shape.push_back(out_w);
output_shape.push_back(output_channels); output_shape.push_back(output_channels);
} else { } else {
// out_h = (height - 1) * stride + 1 + padding - kernel_h + 1; // out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
// out_w = (width -1) * stride + 1 + padding - kernel_w + 1; // out_w = (width -1) * stride + 1 + padding - kernel_w + 1;
paddings.push_back(padding); paddings.push_back(padding);
paddings.push_back(padding); paddings.push_back(padding);
} }
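The commented-out formula above agrees with the shapes the simple deconv tests assert. Rearranged, out = (in - 1) * stride + 2 + padding - kernel; for the 3x3-input, 3x3-kernel, stride-2 case with padding_values {2, 2} that gives (3 - 1) * 2 + 2 + 2 - 3 = 5, i.e. the {1, 5, 5, 3} output expected in TestNHWCSimple3x3SAME_S2.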
...@@ -514,14 +355,11 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -514,14 +355,11 @@ void TestComplexDeconvNxNS12(const int batch,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -551,8 +389,8 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -551,8 +389,8 @@ void TestComplexDeconvNxNS12(const int batch,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
*net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); 1e-4);
}; };
for (int kernel_size : {1, 3, 5, 7}) { for (int kernel_size : {1, 3, 5, 7}) {
...@@ -575,8 +413,8 @@ TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) { ...@@ -575,8 +413,8 @@ TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) {
} }
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) { TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1); TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2); TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
} }
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) { TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
......
...@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s,
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest"; const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph // Construct graph
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW") .Input("InputNCHW")
...@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s, ...@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else { } else {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
...@@ -64,118 +60,98 @@ void RunDepthToSpace(const bool d2s, ...@@ -64,118 +60,98 @@ void RunDepthToSpace(const bool d2s,
net.RunOp(D); net.RunOp(D);
} }
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
class SpaceToDepthOpTest : public OpsTestBase {}; class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 1, 2, 16},
{1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::GPU>(
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 1, 2, 16},
{1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::CPU>(
{1, 2, 3, 4, 5, 6, 7, 8, false, {1, 2, 2, 4},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::GPU>(
{1, 2, 3, 4, 5, 6, 7, 8, false, {1, 2, 2, 4},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
class DepthToSpaceOpTest : public OpsTestBase {}; class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      true, {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
      {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      true, {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
      {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      true, {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      true, {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
  const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
  RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128}, in, 2,
                                   {1, 384, 384, 32}, in);
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void RandomTest(const bool d2s,
                const int block_size,
                const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
srand(time(NULL)); srand(time(NULL));
...@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size,
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", shape); net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW") .Input("InputNCHW")
...@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size,
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg", BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { ...@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
} }
} // namespace test } // namespace test
......
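The random tests above compare a float CPU reference against the GPU kernel, with tolerances of 1e-5 for float and 1e-2 for half. A reasonable mental model of ExpectTensorNear (a sketch under that assumption, not MACE's actual test helper, whose exact semantics live in the test utilities) is an elementwise absolute-plus-relative tolerance check:

#include <cmath>
#include <cstddef>

// Sketch: elementwise closeness with relative and absolute tolerance.
bool TensorsNear(const float *a, const float *b, size_t size,
                 double rtol, double atol) {
  for (size_t i = 0; i < size; ++i) {
    // Fail as soon as one element falls outside atol + rtol * |reference|.
    if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) {
      return false;
    }
  }
  return true;
}

Under this model the looser 1e-2 bounds for half runs simply reflect fp16's shorter mantissa.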
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class DepthwiseConv2dOpTest : public OpsTestBase {}; class DepthwiseConv2dOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void SimpleValidTest() { void SimpleValidTest() {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
// Construct graph // Construct graph
...@@ -30,31 +30,27 @@ void SimpleValidTest() { ...@@ -30,31 +30,27 @@ void SimpleValidTest() {
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}); "Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}); net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -63,15 +59,15 @@ void SimpleValidTest() { ...@@ -63,15 +59,15 @@ void SimpleValidTest() {
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
...@@ -85,8 +81,8 @@ void SimpleValidTest() { ...@@ -85,8 +81,8 @@ void SimpleValidTest() {
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
      {1, 2, 2, 2},
      {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
...@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { ...@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void ComplexValidTest(index_t batch,
                      index_t channel,
                      index_t height,
                      index_t width,
                      index_t kernel,
                      index_t multiplier,
                      int stride) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
// Construct graph // Construct graph
...@@ -125,35 +125,29 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -125,35 +125,29 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
std::vector<float> filter_data(kernel * kernel * channel * multiplier); std::vector<float> filter_data(kernel * kernel * channel * multiplier);
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, GenerateRandomRealTypeData({multiplier, channel, kernel, kernel},
&filter_data); &filter_data);
net.AddInputFromArray<D, float>("Filter", net.AddInputFromArray<D, float>(
{multiplier, channel, kernel, kernel}, "Filter", {multiplier, channel, kernel, kernel}, filter_data);
filter_data);
std::vector<float> bias_data(channel * multiplier); std::vector<float> bias_data(channel * multiplier);
GenerateRandomRealTypeData({channel * multiplier}, &bias_data); GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
net.AddInputFromArray<D, float>("Bias", {channel * multiplier}, net.AddInputFromArray<D, float>("Bias", {channel * multiplier}, bias_data);
bias_data);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -162,15 +156,15 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -162,15 +156,15 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
...@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
} }
} }
    auto expected =
        CreateTensor<T>({1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { ...@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
} }
namespace { namespace {
template<typename T> template <typename T>
void TestNxNS12(const index_t height, const index_t width) { void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -263,74 +257,66 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -263,74 +257,66 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Input", {batch, height, width, input_channels});
net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::GPU, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier {multiplier * input_channels});
* input_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage", BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
    ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "DeviceOutput",
                                          kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
    if (DataTypeToEnum<T>::value == DT_FLOAT) {
      ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5,
                              1e-4);
    } else {
      ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2,
                              1e-2);
    }
}; };
...@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) {
} }
} // namespace } // namespace
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) { TestNxNS12<float>(4, 4); }
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) { TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12<half>(4, 4); TestNxNS12<half>(4, 4);
......
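For the depthwise tests above, the filter is laid out as {multiplier, in_channels, kernel_h, kernel_w}: every input channel is convolved with multiplier filters of its own, so out_channels = in_channels * multiplier. A minimal NHWC sketch with VALID padding and stride 1, assuming output channel order c * multiplier + m (illustrative only; the real MACE kernels live elsewhere in the tree):

#include <cstdint>
#include <vector>

// Sketch: depthwise conv, NHWC input, filter[m][c][kh][kw], VALID, stride 1.
std::vector<float> DepthwiseConv2dValid(
    const std::vector<float> &in, int64_t h, int64_t w, int64_t c,
    const std::vector<float> &filter, int64_t m, int64_t k) {
  const int64_t oh = h - k + 1, ow = w - k + 1, oc = c * m;
  std::vector<float> out(oh * ow * oc, 0.0f);
  for (int64_t y = 0; y < oh; ++y)
    for (int64_t x = 0; x < ow; ++x)
      for (int64_t ci = 0; ci < c; ++ci)
        for (int64_t mi = 0; mi < m; ++mi)
          for (int64_t ky = 0; ky < k; ++ky)
            for (int64_t kx = 0; kx < k; ++kx)
              // Output channel ci * m + mi sees only input channel ci.
              out[(y * ow + x) * oc + ci * m + mi] +=
                  in[((y + ky) * w + (x + kx)) * c + ci] *
                  filter[((mi * c + ci) * k + ky) * k + kx];
  return out;
}

Walking SimpleValidTest's data through this: the first output value of channel 0 is 1*1 + 2*2 + 4*3 + 5*4 = 37, plus the 0.1 bias, matching the expected 37.1f.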
...@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> { ...@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> {
public: public:
EltwiseOp(const OperatorDef &op_def, Workspace *ws) EltwiseOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
        functor_(
            static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                "type", static_cast<int>(kernels::EltwiseType::NONE))),
            OperatorBase::GetRepeatedArgs<float>("coeff"),
            OperatorBase::GetOptionalArg<float>("x", 1.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor* input0 = this->Input(0); const Tensor *input0 = this->Input(0);
const Tensor* input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
return functor_(input0, input1, output, future); return functor_(input0, input1, output, future);
} }
......
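EltwiseOp::Run above forwards input0 and an optional input1 (present only when InputSize() == 2), while the functor carries the scalar operand x (default 1.0). A hypothetical sketch of how such a functor can fall back from the two-tensor case to the tensor-scalar case, shown for SUM only (the names and signature are assumptions, not MACE's kernels::EltwiseFunctor):

#include <cstddef>
#include <vector>

// Sketch: eltwise SUM that uses a scalar when the second tensor is absent.
struct EltwiseSum {
  float scalar_x;  // operand used when input1 == nullptr
  void operator()(const std::vector<float> *input0,
                  const std::vector<float> *input1,
                  std::vector<float> *output) const {
    output->resize(input0->size());
    for (size_t i = 0; i < input0->size(); ++i) {
      const float rhs = input1 ? (*input1)[i] : scalar_x;
      (*output)[i] = (*input0)[i] + rhs;
    }
  }
};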
...@@ -36,10 +36,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, ...@@ -36,10 +36,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input", shape, input); net.AddInputFromArray<D, float>("Input", shape, input);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", net.TransformDataFormat<D, float>("Input", NHWC, "TInput", NCHW);
NHWC,
"TInput",
NCHW);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput") .Input("TInput")
.AddIntArg("type", static_cast<int>(type)) .AddIntArg("type", static_cast<int>(type))
...@@ -48,13 +45,10 @@ void SimpleTensorScalar(const kernels::EltwiseType type, ...@@ -48,13 +45,10 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput", net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
NCHW,
"Output",
NHWC);
} else { } else {
BufferToImage<D, T>(&net, "Input", "InputImg", BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg") .Input("InputImg")
.AddIntArg("type", static_cast<int>(type)) .AddIntArg("type", static_cast<int>(type))
...@@ -90,10 +84,8 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, ...@@ -90,10 +84,8 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input1", shape1, input1); net.AddInputFromArray<D, float>("Input1", shape1, input1);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input0", NHWC, net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
"TInput0", NCHW); net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC,
"TInput1", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0") .Input("TInput0")
.Input("TInput1") .Input("TInput1")
...@@ -104,13 +96,12 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, ...@@ -104,13 +96,12 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput", NCHW, net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
"Output", NHWC);
} else { } else {
BufferToImage<D, T>(&net, "Input0", "InputImg0", BufferToImage<D, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Input1", "InputImg1", BufferToImage<D, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
.Input("InputImg1") .Input("InputImg1")
...@@ -138,278 +129,181 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, ...@@ -138,278 +129,181 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) { TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUM,
                                             {1, 1, 1, 1}, {1}, 1, {2});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUB,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {0, 1, 2, 3, 4, 5});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::PROD,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             2, {2, 4, 6, 8, 10, 12});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::DIV,
                                             {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
                                             2, {1, 2, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MIN,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {1, 1, 1, 1, 1, 1});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MAX,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             3, {3, 3, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::NEG,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             3, {-1, -2, -3, -4, -5, -6});
  SimpleTensorScalar<DeviceType::CPU, float>(
      kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
      {1, 2, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SQR_DIFF,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {0, 1, 4, 9, 16, 25});
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) { TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
                                             {1, 1, 1, 1}, {1}, 1, {2});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {0, 1, 2, 3, 4, 5});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             2, {2, 4, 6, 8, 10, 12});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
                                             {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
                                             2, {1, 2, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {1, 1, 1, 1, 1, 1});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             3, {3, 3, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             3, {-1, -2, -3, -4, -5, -6});
  SimpleTensorScalar<DeviceType::GPU, float>(
      kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
      {1, 2, 3, 4, 5, 6});
  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
                                             {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
                                             1, {0, 1, 4, 9, 16, 25});
} }
TEST_F(EltwiseOpTest, CPUSimpleTensorVector) { TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
} }
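In the tensor-vector cases above, the {1, 1, 1, C} (or {N, 1, 1, C}) operand is broadcast across all spatial positions; with NHWC storage the channel index simply cycles fastest. A sketch of that broadcast for SUB, assuming the vector operand has shape {1, 1, 1, C} (illustrative, not MACE's kernel):

#include <cstdint>
#include <vector>

// Sketch: subtract a length-C channel vector from an NHWC tensor.
std::vector<float> BroadcastSub(const std::vector<float> &tensor,  // N*H*W*C
                                const std::vector<float> &vec,     // C
                                int64_t channels) {
  std::vector<float> out(tensor.size());
  for (size_t i = 0; i < tensor.size(); ++i) {
    out[i] = tensor[i] - vec[i % channels];  // channel index cycles fastest
  }
  return out;
}

This reproduces, e.g., {1..10} minus {1, 2, 3, 4, 5} giving {0, 0, 0, 0, 0, 5, 5, 5, 5, 5} in the SUB case above.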
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) { TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
} }
TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) { TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
      {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
      {1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
      {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) { TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
      {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
      {1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
      {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
} }
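SQR_DIFF in these cases is the elementwise squared difference, which the {0, ..., 25} expectations encode (the second halves of the operands differ by 5). As a one-line sketch of the per-element rule implied by the test data:

// Sketch: SQR_DIFF as the expected values imply, (a - b) * (a - b) per element.
inline float SqrDiff(float a, float b) {
  const float d = a - b;
  return d * d;
}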
namespace { namespace {
...@@ -422,9 +316,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -422,9 +316,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NHWC,
"TInput",
NCHW); NCHW);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput") .Input("TInput")
...@@ -434,15 +326,13 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -434,15 +326,13 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg") .Input("InputImg")
.AddIntArg("type", static_cast<int>(type)) .AddIntArg("type", static_cast<int>(type))
...@@ -455,13 +345,12 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -455,13 +345,12 @@ void RandomTensorScalar(const kernels::EltwiseType type,
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
1e-2);
} }
} }
...@@ -477,10 +366,10 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -477,10 +366,10 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
"TInput0", NCHW); NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
"TInput1", NCHW); NCHW);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0") .Input("TInput0")
.Input("TInput1") .Input("TInput1")
...@@ -491,15 +380,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -491,15 +380,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
// Run // Run
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
"Output", NHWC); NHWC);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1", BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
.Input("InputImg1") .Input("InputImg1")
...@@ -513,13 +402,12 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -513,13 +402,12 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
1e-2);
} }
} }
} // namespace } // namespace
...@@ -549,88 +437,87 @@ TEST_F(EltwiseOpTest, RandomTensorScalarHalf) { ...@@ -549,88 +437,87 @@ TEST_F(EltwiseOpTest, RandomTensorScalarHalf) {
} }
TEST_F(EltwiseOpTest, RandomTensorVecFloat) { TEST_F(EltwiseOpTest, RandomTensorVecFloat) {
  RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
                             {1, 1, 1, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
                             {5, 1, 1, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
                             {1, 1, 1, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 1, 1, 16},
                             {5, 32, 32, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
                             {1, 1, 1, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
                             {1, 31, 37, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {3, 1, 1, 17},
                             {3, 31, 37, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
                             {1, 32, 32, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {5, 31, 37, 17},
                             {5, 1, 1, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {5, 31, 37, 17},
                             {5, 1, 1, 17});
} }
TEST_F(EltwiseOpTest, RandomTensorVecHalf) { TEST_F(EltwiseOpTest, RandomTensorVecHalf) {
  RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
                            {1, 1, 1, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
                            {3, 1, 1, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
                            {1, 1, 1, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 1, 1, 16},
                            {3, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
                            {1, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
                            {5, 1, 1, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
                            {1, 1, 1, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 1, 1, 17},
                            {5, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
                            {1, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
                            {3, 1, 1, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                            {3, 1, 1, 17});
} }
TEST_F(EltwiseOpTest, RandomTensorTensorFloat) { TEST_F(EltwiseOpTest, RandomTensorTensorFloat) {
  RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
                             {1, 32, 32, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
                             {3, 32, 32, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
                             {1, 31, 37, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
                             {5, 31, 37, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
                             {1, 32, 32, 16});
  RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
                             {3, 31, 37, 17});
  RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                             {3, 31, 37, 17});
} }
TEST_F(EltwiseOpTest, RandomTensorTensorHalf) { TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
  RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
                            {1, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
                            {3, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
                            {1, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
                            {5, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
                            {1, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
                            {3, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                            {3, 31, 37, 17});
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> { ...@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
offset->dim_size()); offset->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, nullptr, nullptr, 0, output, future); return functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
} }
......
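The MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR rename seen throughout this commit is the usual early-return status idiom: evaluate a MaceStatus-returning expression and propagate any non-success status to the caller. A plausible shape for such a macro (a sketch; the real definition lives in MACE's headers and may also log the failing statement):

// Sketch of the early-return idiom behind MACE_RETURN_IF_ERROR.
#define MACE_RETURN_IF_ERROR(stmt)              \
  {                                             \
    MaceStatus status = (stmt);                 \
    if (status != MaceStatus::MACE_SUCCESS) {   \
      return status;                            \
    }                                           \
  }

This is why enclosing functions such as EltwiseOp::Run and MaceEngine::Impl::Run must themselves return MaceStatus.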
...@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma, ...@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
} }
} }
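FoldedBatchNorm assumes the batch-norm statistics were folded offline into a per-channel scale and offset, so at run time the op reduces to y = scale * x + offset. CalculateScaleOffset's body falls outside this hunk; the conventional folding, shown here as an assumption rather than MACE's exact code, is scale = gamma / sqrt(var + epsilon) and offset = beta - mean * scale:

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of conventional batch-norm folding; illustrative only.
void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon,
                   std::vector<float> *scale, std::vector<float> *offset) {
  scale->resize(gamma.size());
  offset->resize(gamma.size());
  for (size_t i = 0; i < gamma.size(); ++i) {
    (*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
    (*offset)[i] = beta[i] - mean[i] * (*scale)[i];
  }
}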
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -52,11 +52,11 @@ void Simple() { ...@@ -52,11 +52,11 @@ void Simple() {
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
...@@ -69,11 +69,11 @@ void Simple() { ...@@ -69,11 +69,11 @@ void Simple() {
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -83,10 +83,9 @@ void Simple() { ...@@ -83,10 +83,9 @@ void Simple() {
} }
// Check // Check
  auto expected = CreateTensor<float>(
      {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
                     3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
} }
...@@ -108,29 +107,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -108,29 +107,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
  net.AddRandomInput<DeviceType::GPU, float>("Input",
                                             {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -139,25 +134,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -139,25 +134,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -173,29 +168,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -173,29 +168,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -204,26 +195,26 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -204,26 +195,26 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
...@@ -239,29 +230,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -239,29 +230,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -270,24 +257,24 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -270,24 +257,24 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -303,29 +290,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -303,29 +290,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -334,25 +317,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -334,25 +317,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
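The float and half variants of these FoldedBatchNorm tests differ only in the tolerances handed to ExpectTensorNear: 1e-5 relative / 1e-4 absolute for float, but 1e-2 / 1e-2 for half, whose roughly three decimal digits of precision cannot meet the float bound. A minimal sketch of what such a two-tolerance comparison amounts to, assuming the common combined bound abs_err + rel_err * |expected| (the exact rule inside MACE's Expector is not shown in this diff):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch only: element-wise check against a combined relative/absolute
    // error bound, the general form ExpectTensorNear-style helpers use.
    bool TensorsNear(const std::vector<float> &expected,
                     const std::vector<float> &actual,
                     double rel_err, double abs_err) {
      if (expected.size() != actual.size()) return false;
      for (std::size_t i = 0; i < expected.size(); ++i) {
        double bound = abs_err + rel_err * std::fabs(expected[i]);
        if (std::fabs(expected[i] - actual[i]) > bound) return false;
      }
      return true;
    }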
......
...@@ -23,15 +23,15 @@ ...@@ -23,15 +23,15 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<DeviceType D, class T> template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> { class FullyConnectedOp : public Operator<D, T> {
public: public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_(kernels::StringToActivationType( functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
MACE_CHECK(input->dim(1) == weight->dim(1) MACE_CHECK(
&& input->dim(2) == weight->dim(2) input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) &&
&& input->dim(3) == weight->dim(3) input->dim(3) == weight->dim(3) && weight->dim(0) == bias->dim(0),
&& weight->dim(0) == bias->dim(0), "The shape of Input: ", MakeString(input->shape()),
"The shape of Input: ", "The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
MakeString(input->shape()), bias->dim(0), " don't match.");
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
} else { } else {
MACE_CHECK(input->dim(1) == weight->dim(2) MACE_CHECK(
&& input->dim(2) == weight->dim(3) input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) &&
&& input->dim(3) == weight->dim(1) input->dim(3) == weight->dim(1) && weight->dim(0) == bias->dim(0),
&& weight->dim(0) == bias->dim(0), "The shape of Input: ", MakeString(input->shape()),
"The shape of Input: ", "The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
MakeString(input->shape()), bias->dim(0), " don't match.");
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
} }
return functor_(input, weight, bias, output, future); return functor_(input, weight, bias, output, future);
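The reflowed MACE_CHECKs encode a per-device weight layout: on CPU, weight dims 1-3 line up with the NCHW input dims directly, while on GPU the weight is stored permuted, so input dim 1 is compared against weight dim 2 and so on. A compact restatement of that check (illustrative names, not the MACE API):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    using Shape = std::vector<int64_t>;

    // Mirrors the two MACE_CHECK branches above: bias channels must match
    // weight dim 0, and the three spatial/channel dims of the input must
    // match the weight under the device-specific axis order.
    void CheckFullyConnectedShapes(const Shape &input, const Shape &weight,
                                   int64_t bias_channels, bool on_gpu) {
      bool ok = weight[0] == bias_channels &&
                (on_gpu ? (input[1] == weight[2] && input[2] == weight[3] &&
                           input[3] == weight[1])
                        : (input[1] == weight[1] && input[2] == weight[2] &&
                           input[3] == weight[3]));
      if (!ok) throw std::runtime_error("FC input/weight/bias shapes don't match");
    }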
......
...@@ -24,7 +24,7 @@ namespace test { ...@@ -24,7 +24,7 @@ namespace test {
class FullyConnectedOpTest : public OpsTestBase {}; class FullyConnectedOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple(const std::vector<index_t> &input_shape, void Simple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_value, const std::vector<float> &input_value,
const std::vector<index_t> &weight_shape, const std::vector<index_t> &weight_shape,
...@@ -42,11 +42,11 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -42,11 +42,11 @@ void Simple(const std::vector<index_t> &input_shape,
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
OpDefBuilder("FullyConnected", "FullyConnectedTest") OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input") .Input("Input")
.Input("Weight") .Input("Weight")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
...@@ -59,11 +59,11 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -59,11 +59,11 @@ void Simple(const std::vector<index_t> &input_shape,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FullyConnected", "FullyConnectedTest") OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage") .Input("InputImage")
.Input("WeightImage") .Input("WeightImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -86,14 +86,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPU) { ...@@ -86,14 +86,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPU) {
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206}); {206});
Simple<DeviceType::CPU>( Simple<DeviceType::CPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 1, 2, 5}, {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 1, 2, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::CPU>( Simple<DeviceType::CPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 1, 2, 3}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 1, 2, 3},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96}); {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
} }
TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
...@@ -103,26 +103,26 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { ...@@ -103,26 +103,26 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
TEST_F(FullyConnectedOpTest, SimpleOPENCL) { TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 2, 2}, Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 2, 2},
{1, 3, 5, 7, 2, 4, 6, 8}, {1}, {2}, {1, 1, 1, 1}, {1, 3, 5, 7, 2, 4, 6, 8}, {1}, {2}, {1, 1, 1, 1},
{206}); {206});
Simple<DeviceType::GPU>( Simple<DeviceType::GPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 5, 1, 2}, {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 5, 1, 2},
{1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 10, 60, 20, 70, 30, 80, 40, 90, 50, 100}, {1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 10, 60, 20, 70, 30, 80, 40, 90, 50, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::GPU>( Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2},
{1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6, {1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2,
10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6}, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96}); {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
} }
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 1, 2}, Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 1, 2},
{1, 3, 2, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); {1, 3, 2, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
} }
namespace { namespace {
template<typename T> template <typename T>
void Random(const index_t batch, void Random(const index_t batch,
const index_t height, const index_t height,
const index_t width, const index_t width,
...@@ -134,22 +134,20 @@ void Random(const index_t batch, ...@@ -134,22 +134,20 @@ void Random(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Weight", {out_channel, channels, height, width});
net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, channels, height, width});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest") OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Weight") .Input("Weight")
.Input("Bias") .Input("Bias")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
...@@ -169,12 +167,12 @@ void Random(const index_t batch, ...@@ -169,12 +167,12 @@ void Random(const index_t batch,
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FullyConnected", "FullyConnectedTest") OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage") .Input("InputImage")
.Input("WeightImage") .Input("WeightImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
...@@ -182,11 +180,11 @@ void Random(const index_t batch, ...@@ -182,11 +180,11 @@ void Random(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1,
1e-1, 1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-3); 1e-3);
} }
} }
} // namespace } // namespace
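For reference, the Simple expectations in this file are plain dot products: with input {1..8}, weight {1..8} and bias {2}, the single output is 1·1 + 2·2 + … + 8·8 + 2 = 204 + 2 = 206, and the {387, 3853} case follows the same way. A small reference (not the MACE kernel) that reproduces the first case:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Flatten the input sample, take one dot product per output channel,
    // add the bias.
    std::vector<float> FullyConnectedRef(const std::vector<float> &input,
                                         const std::vector<float> &weight,
                                         const std::vector<float> &bias) {
      const std::size_t out = bias.size(), in = input.size();
      std::vector<float> output(out);
      for (std::size_t o = 0; o < out; ++o) {
        float acc = bias[o];
        for (std::size_t i = 0; i < in; ++i) acc += input[i] * weight[o * in + i];
        output[o] = acc;
      }
      return output;
    }

    int main() {
      auto y = FullyConnectedRef({1, 2, 3, 4, 5, 6, 7, 8},
                                 {1, 2, 3, 4, 5, 6, 7, 8}, {2});
      std::printf("%g\n", y[0]);  // prints 206, matching the test
    }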
......
...@@ -25,8 +25,7 @@ template <DeviceType D, class T> ...@@ -25,8 +25,7 @@ template <DeviceType D, class T>
class LocalResponseNormOp : public Operator<D, T> { class LocalResponseNormOp : public Operator<D, T> {
public: public:
LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws), functor_() {
functor_() {
depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5); depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5);
bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f); bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f);
alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f); alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f);
...@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> { ...@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> {
input->dim_size()); input->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future); return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future);
} }
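The depth_radius / bias / alpha / beta arguments match TensorFlow-style local response normalization, where each channel is divided by (bias + alpha · Σ in²)^beta over a window of ±depth_radius channels. Assuming MACE follows that definition (the functor body is elided from this diff), a reference version over one channel row looks like:

    #include <algorithm>
    #include <cmath>

    // Reference sketch of TF-style LRN; not the MACE kernel.
    //   out[c] = in[c] / pow(bias + alpha * sum_{k in window} in[k]^2, beta)
    void LRNRef(const float *in, int channels, float *out,
                int depth_radius, float bias, float alpha, float beta) {
      for (int c = 0; c < channels; ++c) {
        float sqr_sum = 0.f;
        int lo = std::max(0, c - depth_radius);
        int hi = std::min(channels - 1, c + depth_radius);
        for (int k = lo; k <= hi; ++k) sqr_sum += in[k] * in[k];
        out[c] = in[c] / std::pow(bias + alpha * sqr_sum, beta);
      }
    }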
......
...@@ -21,7 +21,7 @@ namespace test { ...@@ -21,7 +21,7 @@ namespace test {
class LocalResponseNormOpTest : public OpsTestBase {}; class LocalResponseNormOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -33,22 +33,22 @@ void Simple() { ...@@ -33,22 +33,22 @@ void Simple() {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest") OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.AddIntArg("depth_radius", 5) .AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f) .AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f) .AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f) .AddFloatArg("beta", 0.5f)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, {1, 1, 2, 6},
0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0, 1e-2);
} }
......
...@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { ...@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
TEST_F(MatMulOpTest, SimpleOPENCL) { TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1}, Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
{22, 28, 49, 64});
Simple<DeviceType::GPU>( Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
...@@ -127,10 +126,9 @@ void Complex(const index_t batch, ...@@ -127,10 +126,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("A", net.AddRandomInput<DeviceType::GPU, float>("A", {batch, height, channels, 1});
{batch, height, channels, 1}); net.AddRandomInput<DeviceType::GPU, float>("B",
net.AddRandomInput<DeviceType::GPU, float>( {batch, channels, out_width, 1});
"B", {batch, channels, out_width, 1});
// run cpu // run cpu
net.RunOp(); net.RunOp();
...@@ -141,9 +139,9 @@ void Complex(const index_t batch, ...@@ -141,9 +139,9 @@ void Complex(const index_t batch,
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage", BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH); kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage", BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT); kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulTest") OpDefBuilder("MatMul", "MatMulTest")
.Input("AImage") .Input("AImage")
...@@ -156,13 +154,13 @@ void Complex(const index_t batch, ...@@ -156,13 +154,13 @@ void Complex(const index_t batch,
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT); kernels::BufferType::IN_OUT_HEIGHT);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5,
1e-5, 1e-5); 1e-5);
} }
} }
} // namespace } // namespace
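The reflowed SimpleOPENCL case is an ordinary 2x3 · 3x2 product: A = [[1,2,3],[4,5,6]] times B = [[1,2],[3,4],[5,6]] gives exactly the {22, 28, 49, 64} expectation. A reference (again, not the MACE kernel) that reproduces it:

    #include <cstdio>

    // Row-major C = A (m x k) * B (k x n).
    void MatMulRef(const float *a, const float *b, float *c,
                   int m, int k, int n) {
      for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) {
          float acc = 0.f;
          for (int p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
          c[i * n + j] = acc;
        }
    }

    int main() {
      const float a[] = {1, 2, 3, 4, 5, 6}, b[] = {1, 2, 3, 4, 5, 6};
      float c[4];
      MatMulRef(a, b, c, 2, 3, 2);
      std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  // 22 28 49 64
    }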
......
...@@ -112,12 +112,12 @@ class OpsTestNet { ...@@ -112,12 +112,12 @@ class OpsTestNet {
public: public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {} OpsTestNet() : op_registry_(new OperatorRegistry()) {}
template<DeviceType D, typename T> template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name, void AddInputFromArray(const std::string &name,
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
const std::vector<T> &data) { const std::vector<T> &data) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -125,25 +125,25 @@ class OpsTestNet { ...@@ -125,25 +125,25 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T)); memcpy(input_data, data.data(), data.size() * sizeof(T));
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void AddRepeatedInput(const std::string &name, void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
const T data) { const T data) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
std::fill(input_data, input_data + input->size(), data); std::fill(input_data, input_data + input->size(), data);
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void AddRandomInput(const std::string &name, void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
bool positive = true, bool positive = true,
bool truncate = false) { bool truncate = false) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -153,15 +153,15 @@ class OpsTestNet { ...@@ -153,15 +153,15 @@ class OpsTestNet {
std::normal_distribution<float> nd(0, 1); std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate( std::generate(
input_data, input_data + input->size(), input_data, input_data + input->size(),
[&gen, &nd, positive, truncate] { [&gen, &nd, positive, truncate] {
float d = nd(gen); float d = nd(gen);
if (truncate) { if (truncate) {
if (std::abs(d) > 100.f) d = 100.f; if (std::abs(d) > 100.f) d = 100.f;
if (std::abs(d) < 0.001f) d = 0.001f; if (std::abs(d) < 0.001f) d = 0.001f;
} }
return half_float::half_cast<half>(positive ?std::abs(d) : d); return half_float::half_cast<half>(positive ? std::abs(d) : d);
}); });
} else { } else {
std::generate(input_data, input_data + input->size(), std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive, truncate] { [&gen, &nd, positive, truncate] {
...@@ -170,17 +170,15 @@ class OpsTestNet { ...@@ -170,17 +170,15 @@ class OpsTestNet {
if (std::abs(d) > 100.f) d = 100.f; if (std::abs(d) > 100.f) d = 100.f;
if (std::abs(d) < 0.001f) d = 0.001f; if (std::abs(d) < 0.001f) d = 0.001f;
} }
return (positive ?std::abs(d) : d); return (positive ? std::abs(d) : d);
}); });
} }
} }
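Aside from the `?std::abs` spacing fix, the truncate branch is unchanged: it clamps extreme magnitudes so the later half cast stays in a comfortable fp16 range. Distilled below; note it assigns the positive bound regardless of sign, exactly as written in the test utility:

    #include <cmath>

    // Clamp |d| into [0.001, 100] before casting to half, mirroring the
    // truncate branch above (large negatives become +100, as in the original).
    float TruncateForHalf(float d) {
      if (std::abs(d) > 100.f) d = 100.f;
      if (std::abs(d) < 0.001f) d = 0.001f;
      return d;
    }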
template<DeviceType D, typename T> template <DeviceType D, typename T>
void Transpose2D(const std::string &src_name, void Transpose2D(const std::string &src_name, const std::string &dst_name) {
const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D),
GetDeviceAllocator(D),
DataTypeToEnum<T>::v()); DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 2, "input shape != 2"); MACE_CHECK(input_shape.size() == 2, "input shape != 2");
...@@ -192,19 +190,18 @@ class OpsTestNet { ...@@ -192,19 +190,18 @@ class OpsTestNet {
for (index_t i = 0; i < input_shape[0]; ++i) { for (index_t i = 0; i < input_shape[0]; ++i) {
for (index_t j = 0; j < input_shape[1]; ++j) { for (index_t j = 0; j < input_shape[1]; ++j) {
output_data[j * input_shape[0] + i] = output_data[j * input_shape[0] + i] =
input_data[i * input_shape[1] + j]; input_data[i * input_shape[1] + j];
} }
} }
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TransformDataFormat(const std::string &src_name, void TransformDataFormat(const std::string &src_name,
const DataFormat src_format, const DataFormat src_format,
const std::string &dst_name, const std::string &dst_name,
const DataFormat dst_format) { const DataFormat dst_format) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D),
GetDeviceAllocator(D),
DataTypeToEnum<T>::v()); DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4"); MACE_CHECK(input_shape.size() == 4, "input shape != 4");
...@@ -224,7 +221,7 @@ class OpsTestNet { ...@@ -224,7 +221,7 @@ class OpsTestNet {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) { for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] = output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c]; input_data[((b * height + h) * width + w) * channels + c];
} }
} }
} }
...@@ -244,7 +241,7 @@ class OpsTestNet { ...@@ -244,7 +241,7 @@ class OpsTestNet {
for (index_t w = 0; w < width; ++w) { for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
output_data[((b * height + h) * width + w) * channels + c] = output_data[((b * height + h) * width + w) * channels + c] =
input_data[((b * channels + c) * height + h) * width + w]; input_data[((b * channels + c) * height + h) * width + w];
} }
} }
} }
...@@ -264,7 +261,7 @@ class OpsTestNet { ...@@ -264,7 +261,7 @@ class OpsTestNet {
for (index_t i = 0; i < oi; ++i) { for (index_t i = 0; i < oi; ++i) {
for (index_t j = 0; j < hw; ++j) { for (index_t j = 0; j < hw; ++j) {
output_data[i * height * width + j] = output_data[i * height * width + j] =
input_data[j * out_channels * in_channels + i]; input_data[j * out_channels * in_channels + i];
} }
} }
} else if (src_format == OIHW && dst_format == HWOI) { } else if (src_format == OIHW && dst_format == HWOI) {
...@@ -282,7 +279,7 @@ class OpsTestNet { ...@@ -282,7 +279,7 @@ class OpsTestNet {
for (index_t i = 0; i < hw; ++i) { for (index_t i = 0; i < hw; ++i) {
for (index_t j = 0; j < oi; ++j) { for (index_t j = 0; j < oi; ++j) {
output_data[i * out_channels * in_channels + j] = output_data[i * out_channels * in_channels + j] =
input_data[j * height * width + i]; input_data[j * height * width + i];
} }
} }
} else if (src_format == HWIO && dst_format == OIHW) { } else if (src_format == HWIO && dst_format == OIHW) {
...@@ -300,7 +297,8 @@ class OpsTestNet { ...@@ -300,7 +297,8 @@ class OpsTestNet {
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < in_channels; ++c) {
for (index_t k = 0; k < hw; ++k) { for (index_t k = 0; k < hw; ++k) {
output_data[((m * in_channels) + c) * height * width + k] = output_data[((m * in_channels) + c) * height * width + k] =
input_data[k * out_channels * in_channels + c * out_channels + m]; input_data[k * out_channels * in_channels + c * out_channels +
m];
} }
} }
} }
...@@ -309,12 +307,11 @@ class OpsTestNet { ...@@ -309,12 +307,11 @@ class OpsTestNet {
} }
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void FillNHWCInputToNCHWInput(const std::string &name_nchw, void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) { const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc); Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw, Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D),
GetDeviceAllocator(D),
DataTypeToEnum<T>::v()); DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0]; index_t batch = input_shape[0];
...@@ -329,7 +326,7 @@ class OpsTestNet { ...@@ -329,7 +326,7 @@ class OpsTestNet {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) { for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] = output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c]; input_data[((b * height + h) * width + w) * channels + c];
} }
} }
} }
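All of these transform loops reduce to one index identity: an NHWC element (n, h, w, c) lands at NCHW offset ((n·C + c)·H + h)·W + w. The loop being reindented above, in standalone form:

    #include <cstdint>
    #include <vector>

    // NHWC -> NCHW reshuffle for a {n, h, w, c} tensor; reference only.
    std::vector<float> NHWCToNCHW(const std::vector<float> &src,
                                  int64_t n, int64_t h, int64_t w, int64_t c) {
      std::vector<float> dst(src.size());
      for (int64_t b = 0; b < n; ++b)
        for (int64_t y = 0; y < h; ++y)
          for (int64_t x = 0; x < w; ++x)
            for (int64_t k = 0; k < c; ++k)
              dst[((b * c + k) * h + y) * w + x] =
                  src[((b * h + y) * w + x) * c + k];
      return dst;
    }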
...@@ -370,14 +367,12 @@ class OpsTestNet { ...@@ -370,14 +367,12 @@ class OpsTestNet {
// DEPRECATED(liyin): // DEPRECATED(liyin):
// Test and benchmark should setup model once and run multiple times. // Test and benchmark should setup model once and run multiple times.
// Setup time should not be counted during benchmark. // Setup time should not be counted during benchmark.
MaceStatus RunOp() { MaceStatus RunOp() { return RunOp(DeviceType::CPU); }
return RunOp(DeviceType::CPU);
}
MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
device_ = device; device_ = device;
net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT);
MACE_FAILURE_RETURN(net_->Run()); MACE_RETURN_IF_ERROR(net_->Run());
net_ = CreateNet(op_registry_, net_def, &ws_, device); net_ = CreateNet(op_registry_, net_def, &ws_, device);
return net_->Run(); return net_->Run();
} }
...@@ -415,7 +410,7 @@ class OpsTestBase : public ::testing::Test { ...@@ -415,7 +410,7 @@ class OpsTestBase : public ::testing::Test {
} }
}; };
template<typename T> template <typename T>
void GenerateRandomRealTypeData(const std::vector<index_t> &shape, void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> *res, std::vector<T> *res,
bool positive = true) { bool positive = true) {
...@@ -430,11 +425,10 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -430,11 +425,10 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
res->resize(size); res->resize(size);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res->begin(), res->end(), std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
[&gen, &nd, positive] { return half_float::half_cast<half>(positive ? std::abs(nd(gen))
return half_float::half_cast<half>( : nd(gen));
positive ? std::abs(nd(gen)) : nd(gen)); });
});
} else { } else {
std::generate(res->begin(), res->end(), [&gen, &nd, positive] { std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen); return positive ? std::abs(nd(gen)) : nd(gen);
...@@ -442,7 +436,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -442,7 +436,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
} }
} }
template<typename T> template <typename T>
void GenerateRandomIntTypeData(const std::vector<index_t> &shape, void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::vector<T> *res, std::vector<T> *res,
const T a = 0, const T a = 0,
...@@ -460,7 +454,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape, ...@@ -460,7 +454,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); }); std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); });
} }
template<typename T> template <typename T>
std::vector<T> VectorStaticCast(const std::vector<float> &&src) { std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
std::vector<T> dest; std::vector<T> dest;
dest.reserve(src.size()); dest.reserve(src.size());
...@@ -470,11 +464,11 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) { ...@@ -470,11 +464,11 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return std::move(dest); return std::move(dest);
} }
template<typename T> template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape, std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) { const std::vector<T> &data) {
std::unique_ptr<Tensor> res( std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v())); new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape); res->Resize(shape);
T *input_data = res->mutable_data<T>(); T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T)); memcpy(input_data, data.data(), data.size() * sizeof(T));
...@@ -504,24 +498,24 @@ inline std::string ShapeToString(const Tensor &x) { ...@@ -504,24 +498,24 @@ inline std::string ShapeToString(const Tensor &x) {
return std::string(stream.str()); return std::string(stream.str());
} }
template<typename T> template <typename T>
struct is_floating_point_type { struct is_floating_point_type {
static const bool value = std::is_same<T, float>::value || static const bool value = std::is_same<T, float>::value ||
std::is_same<T, double>::value || std::is_same<T, double>::value ||
std::is_same<T, half>::value; std::is_same<T, half>::value;
}; };
template<typename T> template <typename T>
inline void ExpectEqual(const T &a, const T &b) { inline void ExpectEqual(const T &a, const T &b) {
EXPECT_EQ(a, b); EXPECT_EQ(a, b);
} }
template<> template <>
inline void ExpectEqual<float>(const float &a, const float &b) { inline void ExpectEqual<float>(const float &a, const float &b) {
EXPECT_FLOAT_EQ(a, b); EXPECT_FLOAT_EQ(a, b);
} }
template<> template <>
inline void ExpectEqual<double>(const double &a, const double &b) { inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
...@@ -531,13 +525,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) { ...@@ -531,13 +525,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) {
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template<typename EXP_TYPE, template <typename EXP_TYPE,
typename RES_TYPE, typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value> bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template<typename EXP_TYPE, typename RES_TYPE> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
...@@ -554,7 +548,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> { ...@@ -554,7 +548,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
} }
} }
static void Near(const Tensor &x, const Tensor &y, static void Near(const Tensor &x,
const Tensor &y,
const double rel_err, const double rel_err,
const double abs_err) { const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
...@@ -588,7 +583,7 @@ struct Expector<EXP_TYPE, RES_TYPE, true> { ...@@ -588,7 +583,7 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
} }
}; };
template<typename EXP_TYPE, typename RES_TYPE> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, false> { struct Expector<EXP_TYPE, RES_TYPE, false> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
...@@ -605,7 +600,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> { ...@@ -605,7 +600,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
} }
} }
static void Near(const Tensor &x, const Tensor &y, static void Near(const Tensor &x,
const Tensor &y,
const double rel_err, const double rel_err,
const double abs_err) { const double abs_err) {
MACE_UNUSED(rel_err); MACE_UNUSED(rel_err);
...@@ -614,21 +610,23 @@ struct Expector<EXP_TYPE, RES_TYPE, false> { ...@@ -614,21 +610,23 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
} }
}; };
template<typename T> template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5, const double rel_err = 1e-5,
const double abs_err = 1e-8) { const double abs_err = 1e-8) {
Expector<T, T>::Near(x, y, rel_err, abs_err); Expector<T, T>::Near(x, y, rel_err, abs_err);
} }
template<typename EXP_TYPE, typename RES_TYPE> template <typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x, const Tensor &y, void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5, const double rel_err = 1e-5,
const double abs_err = 1e-8) { const double abs_err = 1e-8) {
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, rel_err, abs_err); Expector<EXP_TYPE, RES_TYPE>::Near(x, y, rel_err, abs_err);
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net, void BufferToImage(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -636,11 +634,11 @@ void BufferToImage(OpsTestNet *net, ...@@ -636,11 +634,11 @@ void BufferToImage(OpsTestNet *net,
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef()); .Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
...@@ -648,7 +646,7 @@ void BufferToImage(OpsTestNet *net, ...@@ -648,7 +646,7 @@ void BufferToImage(OpsTestNet *net,
net->Sync(); net->Sync();
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net, void ImageToBuffer(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -656,11 +654,11 @@ void ImageToBuffer(OpsTestNet *net, ...@@ -656,11 +654,11 @@ void ImageToBuffer(OpsTestNet *net,
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef()); .Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
......
...@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> { ...@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> {
PadOp(const OperatorDef &operator_def, Workspace *ws) PadOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings"), functor_(OperatorBase::GetRepeatedArgs<int>("paddings"),
OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {}
{}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
......
...@@ -45,9 +45,7 @@ void Simple() { ...@@ -45,9 +45,7 @@ void Simple() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NHWC,
"TInput",
NCHW); NCHW);
OpDefBuilder("Pad", "PadTest") OpDefBuilder("Pad", "PadTest")
.Input("TInput") .Input("TInput")
...@@ -59,33 +57,25 @@ void Simple() { ...@@ -59,33 +57,25 @@ void Simple() {
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
} }
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>({1, 5, 6, 1}, auto expected = CreateTensor<float>(
{ {1, 5, 6, 1}, {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2,
1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0,
1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, });
1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
});
ExpectTensorNear<float>(*expected, *output, 1e-5); ExpectTensorNear<float>(*expected, *output, 1e-5);
} }
} // namespace } // namespace
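The expected 5x6 grid pins down the Simple() configuration even though its paddings argument sits in an elided hunk: a 2x3 block of 2s padded with top 1, bottom 2, left 1, right 2 and constant_value 1.0. A reference constant pad that reproduces it (the padding amounts are inferred from the expected tensor, not read from the test):

    #include <vector>

    // 2-D constant padding: allocate the padded grid filled with `value`,
    // then copy the input block into place.
    std::vector<float> PadConst2D(const std::vector<float> &in, int h, int w,
                                  int top, int bottom, int left, int right,
                                  float value) {
      int oh = h + top + bottom, ow = w + left + right;
      std::vector<float> out(oh * ow, value);
      for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
          out[(y + top) * ow + (x + left)] = in[y * w + x];
      return out;
    }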
TEST_F(PadTest, SimpleCPU) { TEST_F(PadTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
Simple<DeviceType::CPU>();
}
TEST_F(PadTest, SimpleGPU) { TEST_F(PadTest, SimpleGPU) { Simple<DeviceType::GPU>(); }
Simple<DeviceType::GPU>();
}
TEST_F(PadTest, ComplexCPU) { TEST_F(PadTest, ComplexCPU) {
// Construct graph // Construct graph
...@@ -93,9 +83,7 @@ TEST_F(PadTest, ComplexCPU) { ...@@ -93,9 +83,7 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data // Add input data
net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2); net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NHWC,
"TInput",
NCHW); NCHW);
OpDefBuilder("Pad", "PadTest") OpDefBuilder("Pad", "PadTest")
.Input("TInput") .Input("TInput")
...@@ -106,9 +94,7 @@ TEST_F(PadTest, ComplexCPU) { ...@@ -106,9 +94,7 @@ TEST_F(PadTest, ComplexCPU) {
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
...@@ -134,9 +120,7 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -134,9 +120,7 @@ void Complex(const std::vector<index_t> &input_shape,
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape); net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NHWC,
"TInput",
NCHW); NCHW);
OpDefBuilder("Pad", "PadTest") OpDefBuilder("Pad", "PadTest")
.Input("TInput") .Input("TInput")
...@@ -147,16 +131,14 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -147,16 +131,14 @@ void Complex(const std::vector<index_t> &input_shape,
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pad", "PadTest") OpDefBuilder("Pad", "PadTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -168,7 +150,7 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -168,7 +150,7 @@ void Complex(const std::vector<index_t> &input_shape,
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
auto output = net.GetTensor("OpenCLOutput"); auto output = net.GetTensor("OpenCLOutput");
...@@ -181,24 +163,23 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -181,24 +163,23 @@ void Complex(const std::vector<index_t> &input_shape,
} // namespace } // namespace
TEST_F(PadTest, ComplexFloat) { TEST_F(PadTest, ComplexFloat) {
Complex<float>({1, 32, 32, 4}, Complex<float>({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0}); {0, 0, 2, 2, 1, 1, 0, 0});
Complex<float>({1, 31, 37, 16}, Complex<float>({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0}); {0, 0, 2, 0, 1, 0, 0, 0});
Complex<float>({1, 128, 128, 32}, Complex<float>({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0}); {0, 0, 0, 1, 0, 2, 0, 0});
} }
TEST_F(PadTest, ComplexHalf) { TEST_F(PadTest, ComplexHalf) {
Complex<half>({1, 32, 32, 4}, Complex<half>({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0}); {0, 0, 2, 2, 1, 1, 0, 0});
Complex<half>({1, 31, 37, 16}, Complex<half>({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0}); {0, 0, 2, 0, 1, 0, 0, 0});
Complex<half>({1, 128, 128, 32}, Complex<half>({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0}); {0, 0, 0, 1, 0, 2, 0, 0});
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -31,36 +31,32 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -31,36 +31,32 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2}, "Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
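MAX_VALID's expectation can be checked by hand: channel 0 of the 4x4 input holds 0..15, so 2x2/stride-2 pooling yields {5, 7, 13, 15}; channel 1 adds 16 throughout, giving the interleaved NHWC result {5,21, 7,23, 13,29, 15,31}. A single-plane reference (not the MACE kernel):

    #include <algorithm>
    #include <vector>

    // 2x2 kernel, stride 2, VALID padding over one h x w channel plane.
    std::vector<float> MaxPool2x2s2(const std::vector<float> &in, int h, int w) {
      std::vector<float> out;
      for (int y = 0; y + 1 < h; y += 2)
        for (int x = 0; x + 1 < w; x += 2)
          out.push_back(std::max({in[y * w + x], in[y * w + x + 1],
                                  in[(y + 1) * w + x], in[(y + 1) * w + x + 1]}));
      return out;
    }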
...@@ -73,27 +69,23 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -73,27 +69,23 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1}, net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8}); {0, 1, 2, 3, 4, 5, 6, 7, 8});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -108,30 +100,26 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -108,30 +100,26 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 1}, "Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {2, 2}) .AddIntsArg("dilations", {2, 2})
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -146,31 +134,26 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -146,31 +134,26 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 9, 1}, "Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -180,49 +163,45 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -180,49 +163,45 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
} }
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void SimpleMaxPooling3S2() { void SimpleMaxPooling3S2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 9, 1}, "Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Run // Run
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -237,12 +216,10 @@ void SimpleMaxPooling3S2() { ...@@ -237,12 +216,10 @@ void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); } TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2<GPU>(); }
SimpleMaxPooling3S2<GPU>();
}
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void MaxPooling3S2(const std::vector<index_t> &input_shape, void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides, const std::vector<int> strides,
Padding padding) { Padding padding) {
...@@ -252,27 +229,23 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -252,27 +229,23 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", input_shape); net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
Tensor expected; Tensor expected;
...@@ -281,22 +254,22 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -281,22 +254,22 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-3, 1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
...@@ -330,63 +303,58 @@ TEST_F(PoolingOpTest, AVG_VALID) { ...@@ -330,63 +303,58 @@ TEST_F(PoolingOpTest, AVG_VALID) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2}, "Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
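AVG_VALID uses the same input and windows as MAX_VALID above, so the expectation follows from the window means: (0+1+4+5)/4 = 2.5, (2+3+6+7)/4 = 4.5, (8+9+12+13)/4 = 10.5 and (10+11+14+15)/4 = 12.5 for channel 0, each shifted by 16 for channel 1, which interleaves to {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}.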
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void SimpleAvgPoolingTest() { void SimpleAvgPoolingTest() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1}, "Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
...@@ -399,12 +367,10 @@ void SimpleAvgPoolingTest() { ...@@ -399,12 +367,10 @@ void SimpleAvgPoolingTest() {
} }
} // namespace } // namespace
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest<GPU>(); }
SimpleAvgPoolingTest<GPU>();
}
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void AvgPoolingTest(const std::vector<index_t> &shape, void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels, const std::vector<int> &kernels,
const std::vector<int> &strides, const std::vector<int> &strides,
...@@ -415,27 +381,23 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -415,27 +381,23 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", shape); net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
Tensor expected; Tensor expected;
...@@ -444,68 +406,60 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -444,68 +406,60 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-3, 1e-3); 1e-3);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
1e-5);
} }
} }
} // namespace } // namespace
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
Padding::VALID); AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
Padding::VALID);
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID); Padding::VALID);
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME); Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) { TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID); Padding::VALID);
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME); Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
Padding::VALID); AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
Padding::VALID); AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::SAME);
} }
} // namespace test } // namespace test
......
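The output shapes asserted throughout these pooling tests follow the usual TensorFlow-style size arithmetic. A minimal sketch, with the function name assumed:

#include <cstdint>

int64_t PooledSize(int64_t in, int kernel, int stride, int dilation,
                   bool same_padding) {
  const int64_t effective_kernel = (kernel - 1) * dilation + 1;
  if (same_padding) {
    return (in + stride - 1) / stride;          // SAME: ceil(in / stride)
  }
  return (in - effective_kernel) / stride + 1;  // VALID: floor
}

For MAX_VALID this gives (4 - 2) / 2 + 1 = 2 per spatial dimension, and for MAX_VALID_DILATION the dilated 2x2 kernel spans 3 inputs, so (4 - 3) / 1 + 1 = 2.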
...@@ -45,17 +45,17 @@ TEST_F(ProposalOpTest, CPUSimple) { ...@@ -45,17 +45,17 @@ TEST_F(ProposalOpTest, CPUSimple) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
std::vector<float> scores(height * width * 18); std::vector<float> scores(height * width * 18);
for (size_t i = 0 ; i < scores.size(); ++i) { for (size_t i = 0; i < scores.size(); ++i) {
scores[i] = i; scores[i] = i;
} }
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>("RpnCLSProb",
"RpnCLSProb", {1, height, width, 18}, scores); {1, height, width, 18}, scores);
net.AddRepeatedInput<DeviceType::CPU, float>( net.AddRepeatedInput<DeviceType::CPU, float>("RpnBBoxPred",
"RpnBBoxPred", {1, height, width, 4 * 9}, 1); {1, height, width, 4 * 9}, 1);
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>("ImgInfo", {1, 1, 1, 3},
"ImgInfo", {1, 1, 1, 3}, {img_height, img_width, 2}); {img_height, img_width, 2});
// Run // Run
net.RunOp(); net.RunOp();
...@@ -65,7 +65,6 @@ TEST_F(ProposalOpTest, CPUSimple) { ...@@ -65,7 +65,6 @@ TEST_F(ProposalOpTest, CPUSimple) {
ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5); ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5);
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -35,9 +35,9 @@ void Register_Dequantize(OperatorRegistry *op_registry) { ...@@ -35,9 +35,9 @@ void Register_Dequantize(OperatorRegistry *op_registry) {
void Register_Requantize(OperatorRegistry *op_registry) { void Register_Requantize(OperatorRegistry *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T") .TypeConstraint<uint8_t>("T")
.Build(), .Build(),
RequantizeOp<DeviceType::CPU, uint8_t>); RequantizeOp<DeviceType::CPU, uint8_t>);
} }
......
...@@ -21,12 +21,11 @@ ...@@ -21,12 +21,11 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<DeviceType D, class T> template <DeviceType D, class T>
class QuantizeOp : public Operator<D, T> { class QuantizeOp : public Operator<D, T> {
public: public:
QuantizeOp(const OperatorDef &operator_def, Workspace *ws) QuantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) { : Operator<D, T>(operator_def, ws) {}
}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -39,9 +38,9 @@ class QuantizeOp : public Operator<D, T> { ...@@ -39,9 +38,9 @@ class QuantizeOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
Tensor *out_min = this->Output(OUT_MIN); Tensor *out_min = this->Output(OUT_MIN);
Tensor *out_max = this->Output(OUT_MAX); Tensor *out_max = this->Output(OUT_MAX);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_FAILURE_RETURN(out_min->ResizeLike(in_min)); MACE_RETURN_IF_ERROR(out_min->ResizeLike(in_min));
MACE_FAILURE_RETURN(out_max->ResizeLike(in_max)); MACE_RETURN_IF_ERROR(out_max->ResizeLike(in_max));
return functor_(input, in_min, in_max, output, out_min, out_max, future); return functor_(input, in_min, in_max, output, out_min, out_max, future);
} }
...@@ -54,12 +53,11 @@ class QuantizeOp : public Operator<D, T> { ...@@ -54,12 +53,11 @@ class QuantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
}; };
template<DeviceType D, class T> template <DeviceType D, class T>
class DequantizeOp : public Operator<D, T> { class DequantizeOp : public Operator<D, T> {
public: public:
DequantizeOp(const OperatorDef &operator_def, Workspace *ws) DequantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) { : Operator<D, T>(operator_def, ws) {}
}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -70,7 +68,7 @@ class DequantizeOp : public Operator<D, T> { ...@@ -70,7 +68,7 @@ class DequantizeOp : public Operator<D, T> {
MACE_CHECK(in_max->size() == 1, "max val tensor has more than 1 value"); MACE_CHECK(in_max->size() == 1, "max val tensor has more than 1 value");
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, in_min, in_max, output, future); return functor_(input, in_min, in_max, output, future);
} }
...@@ -83,12 +81,11 @@ class DequantizeOp : public Operator<D, T> { ...@@ -83,12 +81,11 @@ class DequantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
template<DeviceType D, class T> template <DeviceType D, class T>
class RequantizeOp : public Operator<D, T> { class RequantizeOp : public Operator<D, T> {
public: public:
RequantizeOp(const OperatorDef &operator_def, Workspace *ws) RequantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) { : Operator<D, T>(operator_def, ws) {}
}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -112,19 +109,12 @@ class RequantizeOp : public Operator<D, T> { ...@@ -112,19 +109,12 @@ class RequantizeOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
Tensor *out_min = this->Output(OUT_MIN); Tensor *out_min = this->Output(OUT_MIN);
Tensor *out_max = this->Output(OUT_MAX); Tensor *out_max = this->Output(OUT_MAX);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_FAILURE_RETURN(out_min->ResizeLike(in_min)); MACE_RETURN_IF_ERROR(out_min->ResizeLike(in_min));
MACE_FAILURE_RETURN(out_max->ResizeLike(in_max)); MACE_RETURN_IF_ERROR(out_max->ResizeLike(in_max));
return functor_(input, return functor_(input, in_min, in_max, rerange_min, rerange_max, output,
in_min, out_min, out_max, future);
in_max,
rerange_min,
rerange_max,
output,
out_min,
out_max,
future);
} }
private: private:
......
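The mechanical rename from MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR across these files does not change behavior; a status-propagating macro of this shape conventionally expands to an early return. A minimal sketch, which may differ from the actual definition in the MACE headers:

#define MACE_RETURN_IF_ERROR(stmt)                      \
  {                                                     \
    MaceStatus mace_status = (stmt);                    \
    if (mace_status != MaceStatus::MACE_SUCCESS) {      \
      return mace_status;                               \
    }                                                   \
  }

Wrapping the Resize, ResizeLike and Allocate calls in it is what lets allocation failures surface as a returned MaceStatus instead of going unchecked.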
...@@ -26,22 +26,21 @@ TEST_F(QuantizeTest, TestQuantize) { ...@@ -26,22 +26,21 @@ TEST_F(QuantizeTest, TestQuantize) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<CPU, float>("Input", {1, 2, 3, 1}, { net.AddInputFromArray<CPU, float>("Input", {1, 2, 3, 1},
-2, -1, 1, 2, 3, 4 {-2, -1, 1, 2, 3, 4});
});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3}); net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5}); net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Quantize", "QuantizeTest") OpDefBuilder("Quantize", "QuantizeTest")
.Input("Input") .Input("Input")
.Input("InputMin") .Input("InputMin")
.Input("InputMax") .Input("InputMax")
.Output("Output") .Output("Output")
.Output("OutputMin") .Output("OutputMin")
.Output("OutputMax") .Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT}) .OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8) .AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
...@@ -50,10 +49,8 @@ TEST_F(QuantizeTest, TestQuantize) { ...@@ -50,10 +49,8 @@ TEST_F(QuantizeTest, TestQuantize) {
auto output_min = net.GetTensor("OutputMin"); auto output_min = net.GetTensor("OutputMin");
auto output_max = net.GetTensor("OutputMax"); auto output_max = net.GetTensor("OutputMax");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1}, auto expected_output =
{ CreateTensor<uint8_t>({1, 2, 3, 1}, {32, 64, 127, 159, 191, 223});
32, 64, 127, 159, 191, 223
});
auto expected_min = CreateTensor<float>({1}, {-3.01887}); auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5}); auto expected_max = CreateTensor<float>({1}, {5});
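The expected min of -3.01887 is consistent with the quantizer nudging the requested range [-3, 5] so that 0.0 maps exactly to an integer: solving z / (255 - z) = 3 / 5 gives z = 95.625, rounded to a zero point of 96, hence scale = 5 / 159 ≈ 0.0314465 and an adjusted min of -96 * 5 / 159 ≈ -3.01887 while the max stays at 5. The expected bytes then follow from q = (x - min) / scale with truncation: for x = -2 that is 1.01887 / 0.0314465 ≈ 32.4, i.e. 32, and likewise 64, 127, 159, 191 and 223 for the remaining inputs.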
...@@ -69,27 +66,25 @@ TEST_F(QuantizeTest, TestQuantizeTrend) { ...@@ -69,27 +66,25 @@ TEST_F(QuantizeTest, TestQuantizeTrend) {
// Add input data // Add input data
net.AddRandomInput<CPU, float>("Input", {100}); net.AddRandomInput<CPU, float>("Input", {100});
const float *input_data = net.GetTensor("Input")->data<float>(); const float *input_data = net.GetTensor("Input")->data<float>();
net.AddInputFromArray<CPU, float>("InputMin", net.AddInputFromArray<CPU, float>(
{1}, "InputMin", {1},
{*std::min_element(input_data, {*std::min_element(input_data,
input_data input_data + net.GetTensor("Input")->size())});
+ net.GetTensor("Input")->size())}); net.AddInputFromArray<CPU, float>(
net.AddInputFromArray<CPU, float>("InputMax", "InputMax", {1},
{1}, {*std::max_element(input_data,
{*std::max_element(input_data, input_data + net.GetTensor("Input")->size())});
input_data
+ net.GetTensor("Input")->size())});
OpDefBuilder("Quantize", "QuantizeTest") OpDefBuilder("Quantize", "QuantizeTest")
.Input("Input") .Input("Input")
.Input("InputMin") .Input("InputMin")
.Input("InputMax") .Input("InputMax")
.Output("Output") .Output("Output")
.Output("OutputMin") .Output("OutputMin")
.Output("OutputMax") .Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT}) .OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8) .AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
...@@ -113,29 +108,26 @@ TEST_F(QuantizeTest, TestDequantize) { ...@@ -113,29 +108,26 @@ TEST_F(QuantizeTest, TestDequantize) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<CPU, uint8_t>("Input", {1, 2, 3, 1}, { net.AddInputFromArray<CPU, uint8_t>("Input", {1, 2, 3, 1},
32, 64, 127, 159, 191, 223 {32, 64, 127, 159, 191, 223});
});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3.01887}); net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3.01887});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5}); net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Dequantize", "DequantizeTest") OpDefBuilder("Dequantize", "DequantizeTest")
.Input("Input") .Input("Input")
.Input("InputMin") .Input("InputMin")
.Input("InputMax") .Input("InputMax")
.Output("Output") .Output("Output")
.OutputType({DT_FLOAT}) .OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8) .AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<float>({1, 2, 3, 1}, auto expected_output =
{ CreateTensor<float>({1, 2, 3, 1}, {-2, -1, 1, 2, 3, 4});
-2, -1, 1, 2, 3, 4
});
auto expected_min = CreateTensor<float>({1}, {-3.01887}); auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5}); auto expected_max = CreateTensor<float>({1}, {5});
...@@ -147,35 +139,33 @@ TEST_F(QuantizeTest, TestRequantizeWithMinMax) { ...@@ -147,35 +139,33 @@ TEST_F(QuantizeTest, TestRequantizeWithMinMax) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<CPU, int>("Input", {1, 2, 3, 1}, { net.AddInputFromArray<CPU, int>(
-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647 "Input", {1, 2, 3, 1},
}); {-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3}); net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5}); net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
net.AddInputFromArray<CPU, float>("RerangeMin", {1}, {-3.01887}); net.AddInputFromArray<CPU, float>("RerangeMin", {1}, {-3.01887});
net.AddInputFromArray<CPU, float>("RerangeMax", {1}, {5}); net.AddInputFromArray<CPU, float>("RerangeMax", {1}, {5});
OpDefBuilder("Requantize", "RequantizeTest") OpDefBuilder("Requantize", "RequantizeTest")
.Input("Input") .Input("Input")
.Input("InputMin") .Input("InputMin")
.Input("InputMax") .Input("InputMax")
.Input("RerangeMin") .Input("RerangeMin")
.Input("RerangeMax") .Input("RerangeMax")
.Output("Output") .Output("Output")
.Output("OutputMin") .Output("OutputMin")
.Output("OutputMax") .Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT}) .OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8) .AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1}, auto expected_output =
{ CreateTensor<uint8_t>({1, 2, 3, 1}, {32, 64, 128, 160, 191, 223});
32, 64, 128, 160, 191, 223
});
auto expected_min = CreateTensor<float>({1}, {-3.01887}); auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5}); auto expected_max = CreateTensor<float>({1}, {5});
...@@ -187,31 +177,29 @@ TEST_F(QuantizeTest, TestRequantizeWithoutMinMax) { ...@@ -187,31 +177,29 @@ TEST_F(QuantizeTest, TestRequantizeWithoutMinMax) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<CPU, int>("Input", {1, 2, 3, 1}, { net.AddInputFromArray<CPU, int>(
-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647 "Input", {1, 2, 3, 1},
}); {-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3}); net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5}); net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Requantize", "RequantizeTest") OpDefBuilder("Requantize", "RequantizeTest")
.Input("Input") .Input("Input")
.Input("InputMin") .Input("InputMin")
.Input("InputMax") .Input("InputMax")
.Output("Output") .Output("Output")
.Output("OutputMin") .Output("OutputMin")
.Output("OutputMax") .Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT}) .OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8) .AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1}, auto expected_output =
{ CreateTensor<uint8_t>({1, 2, 3, 1}, {0, 43, 128, 170, 213, 255});
0, 43, 128, 170, 213, 255
});
auto expected_min = CreateTensor<float>({1}, {-3.01887}); auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5}); auto expected_max = CreateTensor<float>({1}, {5});
ExpectTensorNear<uint8_t>(*expected_output, *output); ExpectTensorNear<uint8_t>(*expected_output, *output);
......
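The expected bytes in the without-min-max case are consistent with the requantizer deriving the rerange from the extremes it actually sees, mapping [-1073741824, 2147483647] linearly onto [0, 255]: q = round((x + 1073741824) * 255 / 3221225471). For x = -536870912 that is round(42.5) = 43, for 536870912 round(127.5) = 128, and for 1610612736 round(212.5) = 213, matching {0, 43, 128, 170, 213, 255}.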
...@@ -26,9 +26,8 @@ class ResizeBilinearOp : public Operator<D, T> { ...@@ -26,9 +26,8 @@ class ResizeBilinearOp : public Operator<D, T> {
public: public:
ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_( functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/resize_bilinear.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/resize_bilinear.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -33,22 +33,18 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { ...@@ -33,22 +33,18 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
std::vector<float> input(24); std::vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("size", {1, 2}) .AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -66,26 +62,21 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { ...@@ -66,26 +62,21 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
std::vector<float> input(24); std::vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("align_corners", 1) .AddIntArg("align_corners", 1)
.AddIntsArg("size", {1, 2}) .AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
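The two expectations differ because align_corners changes the sampling grid: with it, the horizontal scale is (in_w - 1) / (out_w - 1) = 3, so resizing width 4 to 2 samples source columns 0 and 3, whose three channels are {0, 1, 2} and {9, 10, 11}, exactly the expected tensor above; without it the scale is in_w / out_w = 2 and the sampled coordinates fall between pixels, so the output is bilinearly blended instead.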
...@@ -111,9 +102,7 @@ void TestRandomResizeBilinear() { ...@@ -111,9 +102,7 @@ void TestRandomResizeBilinear() {
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels}); {batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
...@@ -124,10 +113,8 @@ void TestRandomResizeBilinear() { ...@@ -124,10 +113,8 @@ void TestRandomResizeBilinear() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on CPU // Run on CPU
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
...@@ -149,8 +136,8 @@ void TestRandomResizeBilinear() { ...@@ -149,8 +136,8 @@ void TestRandomResizeBilinear() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
// Check // Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-5, 1e-6); 1e-6);
} }
} }
} // namespace } // namespace
......
...@@ -32,12 +32,12 @@ class SliceOp : public Operator<D, T> { ...@@ -32,12 +32,12 @@ class SliceOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->OutputSize() >= 2) MACE_CHECK(this->OutputSize() >= 2)
<< "There must be at least two outputs for slicing"; << "There must be at least two outputs for slicing";
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
const std::vector<Tensor *> output_list = this->Outputs(); const std::vector<Tensor *> output_list = this->Outputs();
const int32_t slice_axis = OperatorBase::GetOptionalArg<int>("axis", 3); const int32_t slice_axis = OperatorBase::GetOptionalArg<int>("axis", 3);
MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0) MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
<< "Outputs do not split input equally."; << "Outputs do not split input equally.";
return functor_(input, output_list, future); return functor_(input, output_list, future);
} }
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <vector> #include <vector>
#include "gmock/gmock.h" #include "gmock/gmock.h"
#include "mace/ops/slice.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/slice.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -26,7 +26,7 @@ namespace test { ...@@ -26,7 +26,7 @@ namespace test {
class SliceOpTest : public OpsTestBase {}; class SliceOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void RandomTest(const int num_outputs, const int axis) { void RandomTest(const int num_outputs, const int axis) {
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
const index_t output_channels = 4 * (1 + rand_r(&seed) % 10); const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
...@@ -43,10 +43,8 @@ void RandomTest(const int num_outputs, const int axis) { ...@@ -43,10 +43,8 @@ void RandomTest(const int num_outputs, const int axis) {
input_shape = {batch, input_channels, height, width}; input_shape = {batch, input_channels, height, width};
else if (axis == 3) else if (axis == 3)
input_shape = {batch, height, width, input_channels}; input_shape = {batch, height, width, input_channels};
const index_t input_size = std::accumulate(input_shape.begin(), const index_t input_size = std::accumulate(
input_shape.end(), input_shape.begin(), input_shape.end(), 1, std::multiplies<index_t>());
1,
std::multiplies<index_t>());
std::vector<float> input_data(input_size); std::vector<float> input_data(input_size);
GenerateRandomRealTypeData(input_shape, &input_data); GenerateRandomRealTypeData(input_shape, &input_data);
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
...@@ -60,8 +58,7 @@ void RandomTest(const int num_outputs, const int axis) { ...@@ -60,8 +58,7 @@ void RandomTest(const int num_outputs, const int axis) {
for (int i = 0; i < num_outputs; ++i) { for (int i = 0; i < num_outputs; ++i) {
builder = builder.Output(MakeString("OutputImage", i)); builder = builder.Output(MakeString("OutputImage", i));
} }
builder builder.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis); auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
...@@ -77,8 +74,7 @@ void RandomTest(const int num_outputs, const int axis) { ...@@ -77,8 +74,7 @@ void RandomTest(const int num_outputs, const int axis) {
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
for (int i = 0; i < num_outputs; ++i) { for (int i = 0; i < num_outputs; ++i) {
ImageToBuffer<D, float>(&net, ImageToBuffer<D, float>(&net, MakeString("OutputImage", i),
MakeString("OutputImage", i),
MakeString("Output", i), MakeString("Output", i),
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
...@@ -90,14 +86,12 @@ void RandomTest(const int num_outputs, const int axis) { ...@@ -90,14 +86,12 @@ void RandomTest(const int num_outputs, const int axis) {
expected_shape = {batch, output_channels, height, width}; expected_shape = {batch, output_channels, height, width};
else if (axis == 3) else if (axis == 3)
expected_shape = {batch, height, width, output_channels}; expected_shape = {batch, height, width, output_channels};
const index_t outer_size = std::accumulate(expected_shape.begin(), const index_t outer_size =
expected_shape.begin() + axis, std::accumulate(expected_shape.begin(), expected_shape.begin() + axis, 1,
1, std::multiplies<index_t>());
std::multiplies<index_t>()); const index_t inner_size =
const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1, std::accumulate(expected_shape.begin() + axis + 1, expected_shape.end(),
expected_shape.end(), 1, std::multiplies<index_t>());
1,
std::multiplies<index_t>());
const float *input_ptr = input_data.data(); const float *input_ptr = input_data.data();
const float *output_ptr; const float *output_ptr;
for (int i = 0; i < num_outputs; ++i) { for (int i = 0; i < num_outputs; ++i) {
...@@ -106,11 +100,11 @@ void RandomTest(const int num_outputs, const int axis) { ...@@ -106,11 +100,11 @@ void RandomTest(const int num_outputs, const int axis) {
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
output_ptr = output->data<float>(); output_ptr = output->data<float>();
for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) { for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
const int idx = (outer_idx * input_channels + i * output_channels) const int idx =
* inner_size; (outer_idx * input_channels + i * output_channels) * inner_size;
for (int j = 0; j < output_channels * inner_size; ++j) { for (int j = 0; j < output_channels * inner_size; ++j) {
ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output " ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2)
<< i << " index " << idx + j; << "with output " << i << " index " << idx + j;
} }
} }
} }
......
...@@ -31,7 +31,7 @@ class SoftmaxOp : public Operator<D, T> { ...@@ -31,7 +31,7 @@ class SoftmaxOp : public Operator<D, T> {
const Tensor *logits = this->Input(LOGITS); const Tensor *logits = this->Input(LOGITS);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
output->ResizeLike(logits); MACE_RETURN_IF_ERROR(output->ResizeLike(logits));
return functor_(logits, output, future); return functor_(logits, output, future);
} }
......
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class SoftmaxOpTest : public OpsTestBase {}; class SoftmaxOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -33,9 +33,9 @@ void Simple() { ...@@ -33,9 +33,9 @@ void Simple() {
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -45,9 +45,9 @@ void Simple() { ...@@ -45,9 +45,9 @@ void Simple() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -60,8 +60,8 @@ void Simple() { ...@@ -60,8 +60,8 @@ void Simple() {
} }
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 1, 2, 4}, {1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
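The second half of the expectation is plain softmax over logits {1, 2, 3, 4}: with e^1 ≈ 2.71828, e^2 ≈ 7.38906, e^3 ≈ 20.0855 and e^4 ≈ 54.5982 the sum is about 84.791, giving 0.0320586, 0.0871443, 0.2368828 and 0.6439143; the four 0.25 values in the first half imply the corresponding logits are all equal.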
...@@ -71,7 +71,7 @@ TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); } ...@@ -71,7 +71,7 @@ TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); } TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); }
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) { void Complex(const std::vector<index_t> &logits_shape) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -81,9 +81,9 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -81,9 +81,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
...@@ -97,9 +97,9 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -97,9 +97,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on gpu // Run on gpu
net.RunOp(D); net.RunOp(D);
...@@ -108,8 +108,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -108,8 +108,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
1e-5);
} }
} // namespace } // namespace
......
...@@ -29,16 +29,14 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -29,16 +29,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
public: public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_( functor_(OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}), false) {}
false) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *space_tensor = this->Input(INPUT); const Tensor *space_tensor = this->Input(INPUT);
Tensor *batch_tensor = this->Output(OUTPUT); Tensor *batch_tensor = this->Output(OUTPUT);
return functor_(const_cast<Tensor *>(space_tensor), batch_tensor, return functor_(const_cast<Tensor *>(space_tensor), batch_tensor, future);
future);
} }
private: private:
......
...@@ -35,22 +35,20 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape, ...@@ -35,22 +35,20 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("paddings", padding_data) .AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == CPU) { } else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("paddings", padding_data) .AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
// Run // Run
...@@ -60,10 +58,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape, ...@@ -60,10 +58,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) { } else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} }
// Check // Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
...@@ -83,22 +79,20 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape, ...@@ -83,22 +79,20 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("crops", crops_data) .AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == CPU) { } else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("crops", crops_data) .AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
// Run // Run
...@@ -108,10 +102,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape, ...@@ -108,10 +102,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) { } else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} }
// Check // Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
...@@ -124,8 +116,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape, ...@@ -124,8 +116,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const std::vector<int> &padding_data, const std::vector<int> &padding_data,
const std::vector<index_t> &batch_shape, const std::vector<index_t> &batch_shape,
const std::vector<float> &batch_data) { const std::vector<float> &batch_data) {
auto space_tensor = std::unique_ptr<Tensor>(new Tensor( auto space_tensor = std::unique_ptr<Tensor>(
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v())); new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
space_tensor->Resize(space_shape); space_tensor->Resize(space_shape);
{ {
Tensor::MappingGuard space_mapper(space_tensor.get()); Tensor::MappingGuard space_mapper(space_tensor.get());
...@@ -136,8 +128,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape, ...@@ -136,8 +128,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
} }
auto batch_tensor = std::unique_ptr<Tensor>(new Tensor( auto batch_tensor = std::unique_ptr<Tensor>(
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v())); new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
batch_tensor->Resize(batch_shape); batch_tensor->Resize(batch_shape);
{ {
Tensor::MappingGuard batch_mapper(batch_tensor.get()); Tensor::MappingGuard batch_mapper(batch_tensor.get());
...@@ -147,12 +139,12 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape, ...@@ -147,12 +139,12 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
} }
RunSpaceToBatch<DeviceType::GPU>(space_shape, space_data, block_data, RunSpaceToBatch<DeviceType::GPU>(space_shape, space_data, block_data,
padding_data, batch_tensor.get()); padding_data, batch_tensor.get());
RunSpaceToBatch<DeviceType::CPU>(space_shape, space_data, block_data, RunSpaceToBatch<DeviceType::CPU>(space_shape, space_data, block_data,
padding_data, batch_tensor.get()); padding_data, batch_tensor.get());
RunBatchToSpace<DeviceType::GPU>(batch_shape, batch_data, block_data, RunBatchToSpace<DeviceType::GPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get()); padding_data, space_tensor.get());
RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data, RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get()); padding_data, space_tensor.get());
} }
...@@ -209,45 +201,41 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) { ...@@ -209,45 +201,41 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2}, {2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24, {1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32}); 9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
} }
void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape, void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data, const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) { const std::vector<int> &padding_data) {
OpsTestNet net; OpsTestNet net;
net.AddRandomInput<GPU, float>("Input", input_shape); net.AddRandomInput<GPU, float>("Input", input_shape);
// run gpu // run gpu
BufferToImage<GPU, float>(&net, "Input", "InputImage", BufferToImage<GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("paddings", padding_data) .AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(GPU); net.RunOp(GPU);
ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU", ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// run cpu // run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("paddings", padding_data) .AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(CPU); net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "OutputCPU", NHWC);
"OutputCPU",
NHWC);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"), ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
...@@ -255,8 +243,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape, ...@@ -255,8 +243,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
} }
void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape, void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data, const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) { const std::vector<int> &crops_data) {
OpsTestNet net; OpsTestNet net;
net.AddRandomInput<GPU, float>("Input", input_shape); net.AddRandomInput<GPU, float>("Input", input_shape);
...@@ -264,38 +252,33 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape, ...@@ -264,38 +252,33 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
BufferToImage<GPU, float>(&net, "Input", "InputImage", BufferToImage<GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("crops", crops_data) .AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(GPU); net.RunOp(GPU);
ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU", ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// run cpu // run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntsArg("crops", crops_data) .AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data) .AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(CPU); net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "OutputCPU", NHWC);
"OutputCPU",
NHWC);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"), ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
*net.GetOutput("OutputGPU")); *net.GetOutput("OutputGPU"));
} }
TEST(SpaceToBatchTest, LargeData) { TEST(SpaceToBatchTest, LargeData) {
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0}); TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4}); TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
......
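Note: the LargeData cases above only compare CPU output against GPU output, so for reference, the SpaceToBatchND output-shape rule is sketched below. The standalone helper and the TensorFlow-style padding order {top, bottom, left, right} are assumptions for illustration, not part of this change.

#include <cstdint>
#include <vector>

// Minimal sketch of the SpaceToBatchND shape rule for NHWC input
// (assumed padding order: {pad_top, pad_bottom, pad_left, pad_right}).
std::vector<int64_t> SpaceToBatchShape(const std::vector<int64_t> &in,  // {N, H, W, C}
                                       const std::vector<int> &block,   // {block_h, block_w}
                                       const std::vector<int> &pad) {
  return {in[0] * block[0] * block[1],
          (in[1] + pad[0] + pad[1]) / block[0],
          (in[2] + pad[2] + pad[3]) / block[1],
          in[3]};
}

// e.g. the LargeData case {1, 256, 256, 32} with block {8, 8} and zero
// padding yields {64, 32, 32, 32}.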
...@@ -24,20 +24,18 @@ ...@@ -24,20 +24,18 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToDepthOp : public Operator<D, T> { class SpaceToDepthOp : public Operator<D, T> {
public: public:
SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) { functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) {}
}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
const int block_size = const int block_size = OperatorBase::GetOptionalArg<int>("block_size", 1);
OperatorBase::GetOptionalArg<int>("block_size", 1);
index_t input_height; index_t input_height;
index_t input_width; index_t input_width;
index_t input_depth; index_t input_depth;
...@@ -55,9 +53,9 @@ class SpaceToDepthOp : public Operator<D, T> { ...@@ -55,9 +53,9 @@ class SpaceToDepthOp : public Operator<D, T> {
MACE_CHECK((input_depth % 4) == 0, MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4"); "input channel should be dividable by 4");
MACE_CHECK( MACE_CHECK(
(input_width % block_size == 0) && (input_height % block_size == 0), (input_width % block_size == 0) && (input_height % block_size == 0),
"input width and height should be dividable by block_size", "input width and height should be dividable by block_size",
input->dim(3)); input->dim(3));
return functor_(input, output, future); return functor_(input, output, future);
} }
......
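Note: the checks above guarantee exact divisibility, under which SpaceToDepth folds each block_size x block_size spatial patch into the channel dimension. A minimal shape sketch; the helper is illustrative, not from this diff.

#include <cstdint>
#include <vector>

// Sketch: SpaceToDepth output shape for NHWC input, assuming height and
// width are divisible by block_size (enforced by the MACE_CHECKs above).
std::vector<int64_t> SpaceToDepthShape(const std::vector<int64_t> &in,  // {N, H, W, C}
                                       int block_size) {
  return {in[0], in[1] / block_size, in[2] / block_size,
          in[3] * block_size * block_size};
}

// e.g. {1, 4, 4, 4} with block_size 2 -> {1, 2, 2, 16}.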
...@@ -19,9 +19,9 @@ namespace ops { ...@@ -19,9 +19,9 @@ namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) { void Register_Transpose(OperatorRegistry *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
TransposeOp<DeviceType::CPU, float>); TransposeOp<DeviceType::CPU, float>);
} }
......
...@@ -18,31 +18,31 @@ ...@@ -18,31 +18,31 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/transpose.h"
#include "mace/kernels/softmax.h" #include "mace/kernels/softmax.h"
#include "mace/kernels/transpose.h"
namespace mace { namespace mace {
template<DeviceType D, class T> template <DeviceType D, class T>
class TransposeOp : public Operator<D, T> { class TransposeOp : public Operator<D, T> {
public: public:
TransposeOp(const OperatorDef &operator_def, Workspace *ws) TransposeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgs<int>("dims")), dims_(OperatorBase::GetRepeatedArgs<int>("dims")),
functor_(dims_) {} functor_(dims_) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
const std::vector<index_t> &input_shape = input->shape(); const std::vector<index_t> &input_shape = input->shape();
MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) ||
|| (input_shape.size() == 2 && dims_.size() == 2), (input_shape.size() == 2 && dims_.size() == 2),
"rank should be 2 or 4"); "rank should be 2 or 4");
std::vector<index_t> output_shape; std::vector<index_t> output_shape;
for (size_t i = 0; i < dims_.size(); ++i) { for (size_t i = 0; i < dims_.size(); ++i) {
output_shape.push_back(input_shape[dims_[i]]); output_shape.push_back(input_shape[dims_[i]]);
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
return functor_(input, output, future); return functor_(input, output, future);
} }
......
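Note: the Resize above uses the permutation rule output_shape[i] = input_shape[dims[i]]. A worked sketch (the standalone helper is illustrative, not part of this change):

#include <cstdint>
#include <vector>

// Sketch of the shape rule used by TransposeOp::Run above.
std::vector<int64_t> PermuteShape(const std::vector<int64_t> &input_shape,
                                  const std::vector<int> &dims) {
  std::vector<int64_t> output_shape;
  for (int d : dims) output_shape.push_back(input_shape[d]);
  return output_shape;
}

// e.g. dims {0, 3, 1, 2} maps an NHWC shape {2, 3, 4, 5} to NCHW
// {2, 5, 3, 4}, matching the TransposeNCHWTest below.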
...@@ -29,18 +29,16 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) { ...@@ -29,18 +29,16 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
net.AddRandomInput<CPU, float>("Input", input_shape); net.AddRandomInput<CPU, float>("Input", input_shape);
OpDefBuilder("Transpose", "TransposeNCHWTest") OpDefBuilder("Transpose", "TransposeNCHWTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntsArg("dims", {0, 3, 1, 2}) .AddIntsArg("dims", {0, 3, 1, 2})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>(
DataFormat::NHWC, "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
"InputNCHW",
DataFormat::NCHW);
ExpectTensorNear<float>(*net.GetOutput("InputNCHW"), ExpectTensorNear<float>(*net.GetOutput("InputNCHW"),
*net.GetOutput("Output")); *net.GetOutput("Output"));
...@@ -53,18 +51,16 @@ void TransposeNHWCTest(const std::vector<index_t> &input_shape) { ...@@ -53,18 +51,16 @@ void TransposeNHWCTest(const std::vector<index_t> &input_shape) {
net.AddRandomInput<CPU, float>("Input", input_shape); net.AddRandomInput<CPU, float>("Input", input_shape);
OpDefBuilder("Transpose", "TransposeNHWCTest") OpDefBuilder("Transpose", "TransposeNHWCTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntsArg("dims", {0, 2, 3, 1}) .AddIntsArg("dims", {0, 2, 3, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>(
DataFormat::NCHW, "Input", DataFormat::NCHW, "InputNHWC", DataFormat::NHWC);
"InputNHWC",
DataFormat::NHWC);
ExpectTensorNear<float>(*net.GetOutput("InputNHWC"), ExpectTensorNear<float>(*net.GetOutput("InputNHWC"),
*net.GetOutput("Output")); *net.GetOutput("Output"));
...@@ -91,16 +87,15 @@ TEST_F(TransposeOpTest, Rank2) { ...@@ -91,16 +87,15 @@ TEST_F(TransposeOpTest, Rank2) {
net.AddInputFromArray<CPU, float>("Input", {2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<CPU, float>("Input", {2, 3}, {1, 2, 3, 4, 5, 6});
OpDefBuilder("Transpose", "TransposeNCHWTest") OpDefBuilder("Transpose", "TransposeNCHWTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntsArg("dims", {1, 0}) .AddIntsArg("dims", {1, 0})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
net.AddInputFromArray<CPU, float>("ExpectedOutput", net.AddInputFromArray<CPU, float>("ExpectedOutput", {3, 2},
{3, 2},
{1, 4, 2, 5, 3, 6}); {1, 4, 2, 5, 3, 6});
ExpectTensorNear<float>(*net.GetOutput("ExpectedOutput"), ExpectTensorNear<float>(*net.GetOutput("ExpectedOutput"),
......
...@@ -83,7 +83,7 @@ void WinogradConvolution(const index_t batch, ...@@ -83,7 +83,7 @@ void WinogradConvolution(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput")); expected.Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape(); auto output_shape = expected.shape();
...@@ -132,34 +132,29 @@ void WinogradConvolution(const index_t batch, ...@@ -132,34 +132,29 @@ void WinogradConvolution(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2);
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4);
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) { TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::VALID); Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME);
Padding::SAME);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37, WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID); Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME);
Padding::SAME);
} }
TEST_F(WinogradConvlutionTest, BatchConvolution) { TEST_F(WinogradConvlutionTest, BatchConvolution) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32, WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID); Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME);
Padding::SAME);
} }
namespace { namespace {
...@@ -199,7 +194,7 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -199,7 +194,7 @@ void WinogradConvolutionWithPad(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput")); expected.Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape(); auto output_shape = expected.shape();
...@@ -248,34 +243,26 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -248,34 +243,26 @@ void WinogradConvolutionWithPad(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2);
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4);
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1);
1); WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1);
1); WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2);
} }
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) { TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1);
1); WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2);
} }
} // namespace test } // namespace test
......
...@@ -65,7 +65,7 @@ enum MaceStatus { ...@@ -65,7 +65,7 @@ enum MaceStatus {
MACE_OUT_OF_RESOURCES = 2 MACE_OUT_OF_RESOURCES = 2
}; };
#define MACE_FAILURE_RETURN(stmt) \ #define MACE_RETURN_IF_ERROR(stmt) \
{ \ { \
MaceStatus status = (stmt); \ MaceStatus status = (stmt); \
if (status != MACE_SUCCESS) { \ if (status != MACE_SUCCESS) { \
......
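Note: the renamed macro evaluates the statement once and, on any status other than MACE_SUCCESS, presumably propagates that status out of the enclosing function (the return statement itself is elided by the truncated hunk). A hedged usage sketch; PrepareOutput is an illustrative name, while Resize returning MaceStatus matches the call sites changed above.

MaceStatus PrepareOutput(Tensor *output, const std::vector<index_t> &shape) {
  // Propagates the first non-MACE_SUCCESS status to the caller.
  MACE_RETURN_IF_ERROR(output->Resize(shape));
  return MACE_SUCCESS;
}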