Commit 33415ee9 authored by: 李寅

Return mace status for allocate

Parent: ccaec70c
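This commit renames MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR and, per the title, propagates the MaceStatus returned by buffer and image allocation instead of dropping it. Judging from the hand-written checks the macro replaces below (see Workspace::LoadModelTensor and Workspace::CreateOutputTensorBuffer), it expands to roughly the following; this is a sketch inferred from the diff, not the verbatim definition:

    // Sketch inferred from the removed checks: evaluate the statement and
    // return early from the enclosing MaceStatus-returning function on any
    // non-success status.
    #define MACE_RETURN_IF_ERROR(stmt)                       \
      {                                                      \
        MaceStatus status = (stmt);                          \
        if (status != MaceStatus::MACE_SUCCESS) {            \
          return status;                                     \
        }                                                    \
      }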
@@ -155,13 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
       }
     } else {
 #endif
-    MACE_FAILURE_RETURN(ws_->LoadModelTensor(
+    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
         *net_def, device_type_, model_data));

     // Init model
     auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                          NetMode::INIT);
-    MACE_FAILURE_RETURN(net->Run());
+    MACE_RETURN_IF_ERROR(net->Run());
     net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
 #ifdef MACE_ENABLE_HEXAGON
   }
@@ -195,7 +195,7 @@ MaceStatus MaceEngine::Impl::Run(
                " please use 1 to fill missing dimensions");
     Tensor *input_tensor =
         ws_->GetTensor(MakeString("mace_input_node_", input.first));
-    input_tensor->Resize(input.second.shape());
+    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
     {
       Tensor::MappingGuard input_guard(input_tensor);
       float *input_data = input_tensor->mutable_data<float>();
@@ -221,7 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
     hexagon_controller_->ExecuteGraph(*input_tensors[0], output_tensors[0]);
   } else {
 #endif
-    MACE_FAILURE_RETURN(net_->Run(run_metadata));
+    MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
...
@@ -71,7 +71,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       CallStats call_stats;
       if (future_wait) {
         StatsFuture future;
-        MACE_FAILURE_RETURN(op->Run(&future));
+        MACE_RETURN_IF_ERROR(op->Run(&future));
         if (run_metadata != nullptr) {
           future.wait_fn(&call_stats);
         } else {
@@ -79,10 +79,10 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
         }
       } else if (run_metadata != nullptr) {
         call_stats.start_micros = NowMicros();
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
         call_stats.end_micros = NowMicros();
       } else {
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
       }

       if (run_metadata != nullptr) {
...
@@ -83,10 +83,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
   } else {
     tensor_buffer_ = std::unique_ptr<Buffer>(
         new Buffer(GetDeviceAllocator(type)));
-    MaceStatus status = tensor_buffer_->Allocate(model_data_size);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      return status;
-    }
+    MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
     tensor_buffer_->Map(nullptr);
     tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
                          0, model_data_size);
@@ -156,11 +153,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
         if (mem_block.mem_id() >= 20000) {
           std::unique_ptr<BufferBase> image_buf(
               new Image());
-          MaceStatus status = image_buf->Allocate(
-              {mem_block.x(), mem_block.y()}, dtype);
-          if (status != MaceStatus::MACE_SUCCESS) {
-            return status;
-          }
+          MACE_RETURN_IF_ERROR(image_buf->Allocate(
+              {mem_block.x(), mem_block.y()}, dtype));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(image_buf));
         }
@@ -168,12 +162,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
         if (mem_block.mem_id() < 20000) {
           std::unique_ptr<BufferBase> tensor_buf(
               new Buffer(GetDeviceAllocator(device_type)));
-          MaceStatus status = tensor_buf->Allocate(
-              mem_block.x() * GetEnumTypeSize(dtype)
-                  + MACE_EXTRA_BUFFER_PAD_SIZE);
-          if (status != MaceStatus::MACE_SUCCESS) {
-            return status;
-          }
+          MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+              mem_block.x() * GetEnumTypeSize(dtype)
+                  + MACE_EXTRA_BUFFER_PAD_SIZE));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(tensor_buf));
         }
...
@@ -40,7 +40,7 @@ struct AddNFunctor {
                         Tensor *output_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
-    MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensors[0]));
+    MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
     index_t size = output_tensor->size();
     Tensor::MappingGuard output_map(output_tensor);
     float *output_data = output_tensor->mutable_data<float>();
...
@@ -103,9 +103,9 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
     for (index_t w = 0; w < out_width; ++w) {
       for (int i = 0; i < filter_height; ++i) {
         for (int j = 0; j < filter_width; ++j) {
-          out_ptr[h * out_width + w]
-              += in_ptr[(h * stride + i) * in_width + (w * stride + j)]
-                  * filter_ptr[i * filter_width + j];
+          out_ptr[h * out_width + w] +=
+              in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
+              filter_ptr[i * filter_width + j];
         }
       }
     }
...
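For reference, the loop reflowed above is the generic scalar fallback for direct convolution: with stride $s$ and a $K_h \times K_w$ filter, each output element accumulates

$$\text{out}(h, w) \mathrel{+}= \sum_{i=0}^{K_h-1} \sum_{j=0}^{K_w-1} \text{in}(h s + i,\; w s + j) \cdot \text{filter}(i, j).$$

The fixed-size NEON kernels in the files that follow compute the same sum for specific (K_h, K_w, stride) combinations.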
@@ -38,16 +38,15 @@ inline void Conv2dCPUK15x1Calc(const float *in_ptr,
     for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
       for (int i = 0; i < 15; ++i) {
         for (int j = 0; j < 1; ++j) {
-          out_ptr[io * out_image_size + ih * out_width + w + iw]
-              += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)]
-                  * filter_ptr[io * in_channels * 15 + i * 1 + j];
+          out_ptr[io * out_image_size + ih * out_width + w + iw] +=
+              in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 1 + j];
         }
       }
     }
   }
 }
-
 // Ho = 4, Wo = 1, Co = 1
 void Conv2dNeonK15x1S1(const float *input,
                        const float *filter,
@@ -69,8 +68,7 @@ void Conv2dNeonK15x1S1(const float *input,
       const index_t out_width = out_shape[3];
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
       for (index_t c = 0; c < in_channels; ++c) {
         const float *in_ptr_base =
             input + b * in_batch_size + c * in_image_size;
...
@@ -31,12 +31,8 @@ void Conv2dNeonK1x1S1(const float *input,
                       const index_t out_channels,
                       float *output) {
   for (index_t b = 0; b < batch; ++b) {
-    Gemm(filter,
-         input + b * in_channels * height * width,
-         1,
-         out_channels,
-         in_channels,
-         height * width,
+    Gemm(filter, input + b * in_channels * height * width, 1, out_channels,
+         in_channels, height * width,
          output + b * out_channels * height * width);
   }
 }
...
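The collapsed Gemm call above relies on the fact that a 1x1 convolution is exactly a matrix multiply: with the filter viewed as an out_channels x in_channels matrix $F$ and each batch's input as an in_channels x (height * width) matrix $I$, the kernel computes $O = F I$, i.e. $O_{m,\,hw} = \sum_{c} F_{m,c}\, I_{c,\,hw}$, which matches the (rows, K, cols) = (out_channels, in_channels, height * width) arguments passed to Gemm.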
@@ -17,8 +17,8 @@
 #endif

 #include "mace/kernels/arm/conv_2d_neon.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/logging.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {
@@ -39,16 +39,15 @@ inline void Conv2dCPUK1x15Calc(const float *in_ptr,
     for (index_t iw = 0; iw < out_width; ++iw) {
       for (int i = 0; i < 1; ++i) {
         for (int j = 0; j < 15; ++j) {
-          out_ptr[io * out_image_size + (h + ih) * out_width + iw]
-              += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)]
-                  * filter_ptr[io * in_channels * 15 + i * 15 + j];
+          out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
+              in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 15 + j];
         }
       }
     }
   }
 }
-
 // Ho = 1, Wo = 4, Co = 1
 void Conv2dNeonK1x15S1(const float *input,
                        const float *filter,
@@ -70,8 +69,7 @@ void Conv2dNeonK1x15S1(const float *input,
       const index_t out_width = out_shape[3];
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
       for (index_t c = 0; c < in_channels; ++c) {
         const float *in_ptr_base =
             input + b * in_batch_size + c * in_image_size;
...
@@ -41,8 +41,7 @@ void Conv2dNeonK1x7S1(const float *input,
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK1x7S1(const float *input,
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
 #if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
           /* load filter (4 outch x 1 height x 4 width) */
           float32x4_t vf00, vf01;
           float32x4_t vf10, vf11;
@@ -241,9 +237,8 @@ void Conv2dNeonK1x7S1(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 1, 7, out_height, out_width,
-                           out_ptr0_base, 1);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
+                           out_height, out_width, out_ptr0_base, 1);
 #endif
       } // c
     }
...
@@ -75,7 +75,6 @@ void Conv2dNeonK3x3S1(const float *input,
         vf11 = vld1q_f32(filter_ptr1 + 3);
         vf12 = vld1q_f32(filter_ptr1 + 6);

         for (index_t h = 0; h + 1 < out_height; h += 2) {
-
           for (index_t w = 0; w + 3 < out_width; w += 4) {
             // input (4 height x 3 slide): vi_height_slide
@@ -198,7 +197,6 @@ void Conv2dNeonK3x3S1(const float *input,
         vf167 = vld1_f32(filter_ptr1 + 6);
         vf189 = vld1_f32(filter_ptr1 + 8);

         for (index_t h = 0; h + 1 < out_height; h += 2) {
-
           for (index_t w = 0; w + 3 < out_width; w += 4) {
             // input (4 height x 3 slide): vi_height_slide
@@ -313,11 +311,11 @@ void Conv2dNeonK3x3S1(const float *input,
       } // c
     } else {
       for (index_t mm = m; mm < out_channels; ++mm) {
-        float
-            *out_ptr0_base = output + b * out_batch_size + mm * out_image_size;
+        float *out_ptr0_base =
+            output + b * out_batch_size + mm * out_image_size;
         for (index_t c = 0; c < in_channels; ++c) {
-          const float
-              *in_ptr0 = input + b * in_batch_size + c * in_image_size;
+          const float *in_ptr0 =
+              input + b * in_batch_size + c * in_image_size;
 #if defined(MACE_ENABLE_NEON)
           const float *in_ptr1 =
               input + b * in_batch_size + c * in_image_size + 1 * in_width;
@@ -396,7 +394,6 @@ void Conv2dNeonK3x3S1(const float *input,
               vst1q_f32(out_ptr0, vo00);
               vst1q_f32(out_ptr0 + out_width, vo01);
-
               in_ptr0 += 4;
               in_ptr1 += 4;
               in_ptr2 += 4;
@@ -482,7 +479,6 @@ void Conv2dNeonK3x3S1(const float *input,
               vst1q_f32(out_ptr0, vo00);
               vst1q_f32(out_ptr0 + out_width, vo01);
-
               in_ptr0 += 4;
               in_ptr1 += 4;
               in_ptr2 += 4;
@@ -499,9 +495,8 @@ void Conv2dNeonK3x3S1(const float *input,
             out_ptr0 += out_width;
           } // h
 #else
-          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0,
-                             in_width, 3, 3, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
+                             out_width, out_ptr0_base, 1);
 #endif
         } // c
       } // mm
@@ -529,8 +524,7 @@ void Conv2dNeonK3x3S2(const float *input,
       const index_t out_height = out_shape[2];
       const index_t out_width = out_shape[3];
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float
-          *filter_ptr = filter + m * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
@@ -656,9 +650,8 @@ void Conv2dNeonK3x3S2(const float *input,
       } // w
     } // h
 #else
-      Conv2dCPUKHxKWCalc(in_base, filter_ptr,
-                         in_width, 3, 3, out_height, out_width,
-                         out_base, 2);
+      Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
+                         out_width, out_base, 2);
 #endif
     } // c
   } // m
...
@@ -205,9 +205,8 @@ void Conv2dNeonK5x5S1(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 5, 5, out_height, out_width,
-                           out_ptr0_base, 1);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
+                           out_height, out_width, out_ptr0_base, 1);
 #endif
       } // c
     } // mm
...
@@ -41,8 +41,7 @@ void Conv2dNeonK7x1S1(const float *input,
       const index_t in_channels = in_shape[1];
       const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK7x1S1(const float *input,
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
 #if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
           /* load filter (4 outch x 4 height x 1 width) */
           float32x4_t vf00, vf01;
           float32x4_t vf10, vf11;
@@ -98,7 +94,6 @@ void Conv2dNeonK7x1S1(const float *input,
                    out_ptr3_base[out_offset + 2 * out_width],
                    out_ptr3_base[out_offset + 3 * out_width]};
-
               // input offset
               index_t in_offset = h * in_width + w;
               // input (3 slide)
@@ -282,9 +277,8 @@ void Conv2dNeonK7x1S1(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 7, 1, out_height, out_width,
-                           out_ptr0_base, 1);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
+                           out_height, out_width, out_ptr0_base, 1);
 #endif
       } // c
     }
...
@@ -298,9 +298,8 @@ void Conv2dNeonK7x7S1(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 7, 7, out_height, out_width,
-                           out_ptr0_base, 1);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                           out_height, out_width, out_ptr0_base, 1);
 #endif
       } // c
     } // mm
@@ -464,9 +463,8 @@ void Conv2dNeonK7x7S2(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 7, 7, out_height, out_width,
-                           out_ptr0_base, 2);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                           out_height, out_width, out_ptr0_base, 2);
 #endif
       } // c
     } // mm
@@ -630,9 +628,8 @@ void Conv2dNeonK7x7S3(const float *input,
         } // w
       } // h
 #else
-        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                           in_width, 7, 7, out_height, out_width,
-                           out_ptr0_base, 3);
+        Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
+                           out_height, out_width, out_ptr0_base, 3);
 #endif
       } // c
     } // mm
...
@@ -17,8 +17,8 @@
 #include "mace/kernels/arm/conv_winograd.h"
 #include "mace/kernels/gemm.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/logging.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {
@@ -49,9 +49,8 @@ void TransformInput4x4(const float *input,
             s15;

         // load tile data
-        const float *input_ptr =
-            input + n * input_batch_size + c * in_height_width + h * in_width
-                + w;
+        const float *input_ptr = input + n * input_batch_size +
+            c * in_height_width + h * in_width + w;
         d0 = input_ptr[0];
         d1 = input_ptr[1];
         d2 = input_ptr[2];
@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input,
       float s[8][8];
       for (index_t h = 0; h < in_height - 2; h += 6) {
         for (index_t w = 0; w < in_width - 2; w += 6) {
-          const float *input_ptr =
-              input + n * input_batch_size + c * in_height_width + h * in_width
-                  + w;
+          const float *input_ptr = input + n * input_batch_size +
+              c * in_height_width + h * in_width + w;
           for (int i = 0; i < 8; ++i) {
             float d0, d1, d2, d3, d4, d5, d6, d7;
@@ -258,25 +256,16 @@ void BatchGemm(const float *input,
   const index_t out_stride = out_channels * tile_count;

   if (batch == 1) {
-    Gemm(filter,
-         input,
-         in_tile_area,
-         out_channels,
-         in_channels,
-         tile_count,
+    Gemm(filter, input, in_tile_area, out_channels, in_channels, tile_count,
          output);
   } else {
 #pragma omp parallel for collapse(2)
     for (int b = 0; b < batch; ++b) {
       for (int i = 0; i < in_tile_area; ++i) {
-        const float
-            *in_ptr = input + b * in_batch_size + i * in_stride;
+        const float *in_ptr = input + b * in_batch_size + i * in_stride;
         const float *filter_ptr = filter + i * filter_stride;
         float *out_ptr = output + b * out_batch_size + i * out_stride;
-        Gemm(filter_ptr,
-             in_ptr,
-             1,
-             out_channels, /* rows */
+        Gemm(filter_ptr, in_ptr, 1, out_channels, /* rows */
              in_channels, /* K */
              tile_count, /* cols */
              out_ptr);
@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input,
         v2 = s2 - s4 - s6;
         v3 = s3 - s5 - s7;

-        float *output_ptr =
-            output + n * output_batch_size + m * out_image_size + h * out_width
-                + w;
+        float *output_ptr = output + n * output_batch_size +
+            m * out_image_size + h * out_width + w;
         output_ptr[0] = v0;
         output_ptr[1] = v1;
         output_ptr[out_width] = v2;
@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input,
             input_ptr += 8 * stride;
           }

-          float *output_ptr =
-              output + n * output_batch_size + m * out_image_size + h * out_width
-                  + w;
+          float *output_ptr = output + n * output_batch_size +
+              m * out_image_size + h * out_width + w;
           for (int i = 0; i < 6; ++i) {
             float d0, d1, d2, d3, d4, d5, d6, d7;
@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input,
   }
 } // namespace
-
 // OCHW => TOC
 // no need to optimize, it will exist in converter
 void TransformFilter4x4(const float *filter,
@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter,
                         float *output) {
   const index_t stride = out_channels * in_channels;

-  const float G[8][3] = {
-      {1.0f, 0.0f, 0.0f},
+  const float G[8][3] = {{1.0f, 0.0f, 0.0f},
       {-2.0f / 9, -2.0f / 9, -2.0f / 9},
       {-2.0f / 9, 2.0f / 9, -2.0f / 9},
       {1.0f / 90, 1.0f / 45, 2.0f / 45},
       {1.0f / 90, -1.0f / 45, 2.0f / 45},
       {1.0f / 45, 1.0f / 90, 1.0f / 180},
       {1.0f / 45, -1.0f / 90, 1.0f / 180},
-      {0.0f, 0.0f, 1.0f}
-  };
+      {0.0f, 0.0f, 1.0f}};

 #pragma omp parallel for collapse(2)
   for (index_t m = 0; m < out_channels; ++m) {
@@ -640,55 +624,31 @@ void WinoGradConv3x3s1(const float *input,
   switch (out_tile_size) {
     case 2:
-      TransformInput4x4(input,
-                        batch,
-                        in_height,
-                        in_width,
-                        in_channels,
-                        tile_count,
-                        transformed_input);
+      TransformInput4x4(input, batch, in_height, in_width, in_channels,
+                        tile_count, transformed_input);
       break;
     case 6:
-      TransformInput8x8(input,
-                        batch,
-                        in_height,
-                        in_width,
-                        in_channels,
-                        tile_count,
-                        transformed_input);
+      TransformInput8x8(input, batch, in_height, in_width, in_channels,
+                        tile_count, transformed_input);
       break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
   }

-  BatchGemm(transformed_input,
-            transformed_filter,
-            batch,
-            in_channels,
-            out_channels,
-            tile_count,
-            out_tile_size,
-            transformed_output);
+  BatchGemm(transformed_input, transformed_filter, batch, in_channels,
+            out_channels, tile_count, out_tile_size, transformed_output);

   switch (out_tile_size) {
     case 2:
-      TransformOutput4x4(transformed_output,
-                         batch,
-                         out_height,
-                         out_width,
-                         out_channels,
-                         tile_count,
-                         output);
+      TransformOutput4x4(transformed_output, batch, out_height, out_width,
+                         out_channels, tile_count, output);
       break;
     case 6:
-      TransformOutput8x8(transformed_output,
-                         batch,
-                         out_height,
-                         out_width,
-                         out_channels,
-                         tile_count,
-                         output);
+      TransformOutput8x8(transformed_output, batch, out_height, out_width,
+                         out_channels, tile_count, output);
       break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
   }
 }
@@ -712,8 +672,8 @@ void WinoGradConv3x3s1(const float *input,
   index_t transformed_input_size =
       in_tile_area * batch * in_channels * tile_count;
   index_t transformed_filter_size = in_tile_area * out_channels * in_channels;
-  index_t
-      transformed_output_size = in_tile_area * batch * out_channels * tile_count;
+  index_t transformed_output_size =
+      in_tile_area * batch * out_channels * tile_count;

   float *transformed_input = new float[transformed_input_size]; // TNCB
   float *transformed_filter = new float[transformed_filter_size]; // TOC
@@ -721,35 +681,22 @@ void WinoGradConv3x3s1(const float *input,
   switch (out_tile_size) {
     case 2:
-      TransformFilter4x4(filter,
-                         in_channels,
-                         out_channels,
-                         transformed_filter);
+      TransformFilter4x4(filter, in_channels, out_channels, transformed_filter);
       break;
     case 6:
-      TransformFilter8x8(filter,
-                         in_channels,
-                         out_channels,
-                         transformed_filter);
+      TransformFilter8x8(filter, in_channels, out_channels, transformed_filter);
       break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
   }

-  WinoGradConv3x3s1(input,
-                    transformed_filter,
-                    batch,
-                    in_height,
-                    in_width,
-                    in_channels,
-                    out_channels,
-                    out_tile_size,
-                    transformed_input,
-                    transformed_output,
-                    output);
+  WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width,
+                    in_channels, out_channels, out_tile_size, transformed_input,
+                    transformed_output, output);

-  delete[]transformed_input;
-  delete[]transformed_filter;
-  delete[]transformed_output;
+  delete[] transformed_input;
+  delete[] transformed_filter;
+  delete[] transformed_output;
 }

 void ConvRef3x3s1(const float *input,
@@ -778,10 +725,9 @@ void ConvRef3x3s1(const float *input,
               index_t iw = w + kw;
               index_t in_offset =
                   ((b * in_channels + c) * in_height + ih) * in_width + iw;
-              index_t
-                  filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
-              output[out_offset] +=
-                  input[in_offset] * filter[filter_offset];
+              index_t filter_offset =
+                  (((m * in_channels) + c) * 3 + kh) * 3 + kw;
+              output[out_offset] += input[in_offset] * filter[filter_offset];
             }
           }
         }
...
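Context for the calls reflowed above: out_tile_size selects between the two standard Winograd variants, F(2x2, 3x3) with 4x4 input tiles (TransformInput4x4) and F(6x6, 3x3) with 8x8 input tiles (TransformInput8x8, hence the h += 6 / w += 6 stepping). Each output tile is the usual Winograd evaluation

$$Y = A^{T}\big[(G\, g\, G^{T}) \odot (B^{T} d\, B)\big] A,$$

where $g$ is the 3x3 filter, $d$ the input tile, and $\odot$ elementwise multiplication; the 8x3 matrix G spelled out in TransformFilter8x8 is the filter transform for the F(6x6, 3x3) case, and BatchGemm implements the elementwise-product stage as in_tile_area independent GEMMs over the channel dimension.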
@@ -13,13 +13,13 @@
 // limitations under the License.

 #include <gtest/gtest.h>
-#include <random>
 #include <algorithm>
 #include <memory>
+#include <random>

-#include "mace/kernels/arm/conv_winograd.h"
-#include "mace/core/types.h"
 #include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/kernels/arm/conv_winograd.h"

 namespace mace {
 namespace kernels {
@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
-  std::generate(input_data, input_data + input_size,
-                [&gen, &nd] {
+  std::generate(input_data, input_data + input_size, [&gen, &nd] {
     return std::max(-1.0f, std::min(1.0f, nd(gen)));
   });
-  std::generate(filter_data, filter_data + filter_size,
-                [&gen, &nd] {
+  std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
     return std::max(-1.0f, std::min(1.0f, nd(gen)));
   });

-  kernels::ConvRef3x3s1(input_data,
-                        filter_data,
-                        batch,
-                        in_height,
-                        in_width,
-                        in_channels,
-                        out_channels,
-                        output_data_ref);
+  kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
+                        in_channels, out_channels, output_data_ref);

-  kernels::WinoGradConv3x3s1(input_data,
-                             filter_data,
-                             batch,
-                             in_height,
-                             in_width,
-                             in_channels,
-                             out_channels,
-                             6,
+  kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
+                             in_width, in_channels, out_channels, 6,
                              output_data);

   // test
...
@@ -16,8 +16,8 @@
 #include <arm_neon.h>
 #endif

-#include "mace/kernels/arm/depthwise_conv2d_neon.h"
 #include "mace/core/macros.h"
+#include "mace/kernels/arm/depthwise_conv2d_neon.h"

 namespace mace {
 namespace kernels {
@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
 // Ho = 2, Wo = 4, Co = 1
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
-                               const index_t* in_shape,
-                               const index_t* out_shape,
-                               const int* pad_hw,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
     // top
     for (h = 0; h < valid_h_start; ++h) {
       for (w = 0; w < out_shape[3]; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
       }
     }
@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
     for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
       // left
       for (w = 0; w < valid_w_start; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h + 1,
-                             w,
-                             h + 1 - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
       }

       for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
       // right
       for (; w < out_width; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h + 1,
-                             w,
-                             h + 1 - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
       }
     } // h
 #else
     for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
       for (index_t iw = 0; iw < out_shape[3]; ++iw) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             ih,
-                             iw,
-                             ih - pad_top,
-                             iw - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih - pad_top,
+                             iw - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
       }
     }
 #endif
@@ -275,18 +221,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
     // bottom
     for (; h < out_shape[2]; ++h) {
       for (w = 0; w < out_shape[3]; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h - pad_top,
-                             w - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
+                             w - pad_left, out_width, in_height, in_width, 3,
+                             3, out_base);
       }
     }
   } // m
@@ -295,9 +232,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const float *filter,
-                               const index_t* in_shape,
-                               const index_t* out_shape,
-                               const int* pad_hw,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
     // top
     for (h = 0; h < valid_h_start; ++h) {
       for (w = 0; w < out_width; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h * 2 - pad_top,
-                             w * 2 - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
+                             w * 2 - pad_left, out_width, in_height, in_width,
+                             3, 3, out_base);
       }
     }
@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
     for (h = valid_h_start; h < valid_h_stop; ++h) {
       // left
       for (w = 0; w < valid_w_start; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h * 2 - pad_top,
-                             w * 2 - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
+                             w * 2 - pad_left, out_width, in_height, in_width,
+                             3, 3, out_base);
       }

       for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
       // right
       for (; w < out_width; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h * 2 - pad_top,
-                             w * 2 - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
+                             w * 2 - pad_left, out_width, in_height, in_width,
+                             3, 3, out_base);
       }
     } // h
 #else
     for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
       for (index_t iw = 0; iw < out_width; ++iw) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             ih,
-                             iw,
-                             ih * 2 - pad_top,
-                             iw * 2 - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih * 2 - pad_top,
+                             iw * 2 - pad_left, out_width, in_height,
+                             in_width, 3, 3, out_base);
       }
     }
 #endif
@@ -471,18 +372,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
     // bottom
     for (; h < out_shape[2]; ++h) {
       for (w = 0; w < out_shape[3]; ++w) {
-        DepthwiseConv2dPixel(in_base,
-                             filter_ptr,
-                             h,
-                             w,
-                             h * 2 - pad_top,
-                             w * 2 - pad_left,
-                             out_width,
-                             in_height,
-                             in_width,
-                             3,
-                             3,
-                             out_base);
+        DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
+                             w * 2 - pad_left, out_width, in_height, in_width,
+                             3, 3, out_base);
       }
     }
   } // m
...
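The repeated DepthwiseConv2dPixel calls condensed above handle the padded borders one output pixel at a time, while the NEON body covers the interior rows. The helper's definition is outside this diff; the following is a hedged sketch reconstructed from its call sites, not the project's actual implementation:

    // Hedged reconstruction of the per-pixel border fallback: computes one
    // depthwise output at (out_h, out_w), skipping filter taps that fall
    // outside the input, i.e. implicit zero padding.
    #include <cstdint>
    typedef int64_t index_t;

    inline void DepthwiseConv2dPixelSketch(const float *in_base,
                                           const float *filter_ptr,
                                           index_t out_h, index_t out_w,
                                           index_t in_h_base, index_t in_w_base,
                                           index_t out_width, index_t in_height,
                                           index_t in_width, int filter_height,
                                           int filter_width, float *out_base) {
      float sum = 0.f;
      for (int i = 0; i < filter_height; ++i) {
        for (int j = 0; j < filter_width; ++j) {
          const index_t ih = in_h_base + i;
          const index_t iw = in_w_base + j;
          if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
            sum += in_base[ih * in_width + iw] * filter_ptr[i * filter_width + j];
          }
        }
      }
      out_base[out_h * out_width + out_w] = sum;
    }

This also explains why only the call arguments differ between the two kernels: the stride-2 variant passes h * 2 - pad_top and w * 2 - pad_left as the input base coordinates.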
@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor {
                         Tensor *output,
                         StatsFuture *future) {
     MACE_UNUSED(future);
-    MACE_FAILURE_RETURN(output->ResizeLike(input));
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));

     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
...
@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
       outer_sizes[i] = input->size() / inner_size;
       output_shape[axis_] += input->dim(axis_);
     }

-    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

     T *output_ptr = output->mutable_data<T>();
...
@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                      RoundType::FLOOR,
                      output_shape.data());
     }
-    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

     index_t batch = output->dim(0);
     index_t channels = output->dim(1);
@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       if (is_filter_transformed_) {
         transformed_filter_ptr = filter_data;
       } else {
-        MACE_FAILURE_RETURN(transformed_filter_.Resize(
+        MACE_RETURN_IF_ERROR(transformed_filter_.Resize(
             transformed_filter_shape));
         switch (winograd_out_tile_size) {
           case 2:
@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     const Tensor *pad_input_ptr = input;
     if (extra_input_height != input_height
         || extra_input_width != input_width) {
-      MACE_FAILURE_RETURN(ConstructNCHWInputWithSpecificPadding(input,
+      MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input,
                                                                 pad_top,
                                                                 pad_bottom,
                                                                 pad_left,
...
@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
   const int padded_top = paddings[0] / 2;
   const int padded_left = paddings[1] / 2;

-  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
+  MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));

   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
   const int pad_width = pad_left + pad_right;
   std::vector<index_t> output_shape(
       {batch, channels, height + pad_height, width + pad_width});
-  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
+  MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
   output_tensor->Clear();
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
   const int padded_top = paddings[0] / 2;
   const int padded_left = paddings[1] / 2;

-  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
+  MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));

   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
...
@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                              strides_, padding_type_,
                              output_shape.data(),
                              paddings_.data(), true);
-      MACE_FAILURE_RETURN(output->Resize(output_shape));
+      MACE_RETURN_IF_ERROR(output->Resize(output_shape));
     } else {
       output_shape_.clear();
       output_shape_ = std::vector<index_t>(4, 0);
@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                              strides_,
                              output_shape_.data(),
                              paddings_.data(), true);
-      MACE_FAILURE_RETURN(output->Resize(output_shape_));
+      MACE_RETURN_IF_ERROR(output->Resize(output_shape_));
     }
     index_t kernel_h = filter->dim(2);
     index_t kernel_w = filter->dim(3);
...
@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor {
     std::vector<index_t> output_shape = {batch_size, output_depth,
                                          output_height, output_width};

-    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
...
@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
                      RoundType::FLOOR,
                      output_shape.data());
     }
-    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
     output->Clear();

     index_t batch = output->dim(0);
...
@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
         }
       }
     }
-    MACE_FAILURE_RETURN(output->ResizeLike(input0));
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input0));

     Tensor::MappingGuard input0_guard(input0);
     Tensor::MappingGuard output_guard(output);
...
@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
                         StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
-    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
     const index_t N = output->dim(0);
     const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
     const index_t output_size = weight->dim(0);
...
@@ -44,7 +44,7 @@ struct MatMulFunctor {
                         StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
-    MACE_FAILURE_RETURN(C->Resize(c_shape));
+    MACE_RETURN_IF_ERROR(C->Resize(c_shape));

     Tensor::MappingGuard guarda(A);
     Tensor::MappingGuard guardb(B);
...
@@ -21,9 +21,9 @@
 namespace mace {
 namespace kernels {

-template<typename T>
-MaceStatus ActivationFunctor<DeviceType::GPU,
-                             T>::operator()(const Tensor *input,
+template <typename T>
+MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
     const Tensor *alpha,
     Tensor *output,
     StatsFuture *future) {
@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     switch (activation_) {
-      case RELU:tuning_key_prefix_ = "relu_opencl_kernel";
+      case RELU:
+        tuning_key_prefix_ = "relu_opencl_kernel";
         built_options.emplace("-DUSE_RELU");
         break;
-      case RELUX:tuning_key_prefix_ = "relux_opencl_kernel";
+      case RELUX:
+        tuning_key_prefix_ = "relux_opencl_kernel";
         built_options.emplace("-DUSE_RELUX");
         break;
-      case PRELU:tuning_key_prefix_ = "prelu_opencl_kernel";
+      case PRELU:
+        tuning_key_prefix_ = "prelu_opencl_kernel";
         built_options.emplace("-DUSE_PRELU");
         break;
-      case TANH:tuning_key_prefix_ = "tanh_opencl_kernel";
+      case TANH:
+        tuning_key_prefix_ = "tanh_opencl_kernel";
         built_options.emplace("-DUSE_TANH");
         break;
-      case SIGMOID:tuning_key_prefix_ = "sigmoid_opencl_kernel";
+      case SIGMOID:
+        tuning_key_prefix_ = "sigmoid_opencl_kernel";
         built_options.emplace("-DUSE_SIGMOID");
         break;
-      default:LOG(FATAL) << "Unknown activation type: " << activation_;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation_;
     }

     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   return MACE_SUCCESS;
 }

-template
-struct ActivationFunctor<DeviceType::GPU, float>;
-template
-struct ActivationFunctor<DeviceType::GPU, half>;
+template struct ActivationFunctor<DeviceType::GPU, float>;
+template struct ActivationFunctor<DeviceType::GPU, half>;
 } // namespace kernels
 } // namespace mace
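A note on the kernel_error_ lines that MACE_RETURN_IF_ERROR now guards in the OpenCL functors: when out-of-range checking is enabled, the host allocates a one-byte GPU buffer, maps it, zeroes it, and unmaps it before building the kernel with -DOUT_OF_RANGE_CHECK, giving the kernel a byte to set when it detects an out-of-bounds access. The companion check after the kernel completes is not part of this diff; presumably the byte is re-mapped and verified along these lines (a hedged sketch):

    // Presumed post-run check (not shown in this diff): re-map the one-byte
    // flag written by the kernel and fail loudly on a non-zero value.
    kernel_error_->Map(nullptr);
    char error_flag = *(kernel_error_->mutable_data<char>());
    kernel_error_->UnMap();
    MACE_CHECK(error_flag == 0) << "OpenCL kernel reported an out-of-range access";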
@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -87,8 +87,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                     &output_image_shape);
-    MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape,
-                                                   output_image_shape));
+    MACE_RETURN_IF_ERROR(
+        output_tensor->ResizeImage(output_shape, output_image_shape));

     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -23,7 +23,8 @@ namespace mace {
 namespace kernels {

 template <typename T>
-MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
     const Tensor *mean,
@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
...
@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   } else {
     std::vector<uint32_t> roundup_gws(lws.size());
     for (size_t i = 0; i < lws.size(); ++i) {
-      if (lws[i] != 0)
-        roundup_gws[i] = RoundUp(gws[i], lws[i]);
+      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
     }

     error = runtime->command_queue().enqueueNDRangeKernel(
...
@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     const BufferType type,
     Tensor *image,
     StatsFuture *future) {
   std::vector<size_t> image_shape;
   CalImage2DShape(buffer->shape(), type, &image_shape);
   if (type == WINOGRAD_FILTER) {
     std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
-    MACE_FAILURE_RETURN(image->ResizeImage(new_shape, image_shape));
+    MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
   } else {
-    MACE_FAILURE_RETURN(image->ResizeImage(buffer->shape(), image_shape));
+    MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
   }
 
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
   if (!kernel_error_) {
     kernel_error_ = std::move(std::unique_ptr<Buffer>(
         new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-    kernel_error_->Allocate(1);
+    MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
     kernel_error_->Map(nullptr);
     *(kernel_error_->mutable_data<char>()) = 0;
     kernel_error_->UnMap();
@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
                     static_cast<uint32_t>(buffer->buffer_offset() /
                                           GetEnumTypeSize(buffer->dtype())));
   if (type == CONV2D_FILTER) {
-    const index_t inner_size =
-        buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
+    const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
...
@@ -16,18 +16,16 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
 template <typename T>
 MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
-    const Tensor *input,
-    Tensor *output,
-    StatsFuture *future) {
-  MACE_FAILURE_RETURN(output->ResizeLike(input));
+    const Tensor *input, Tensor *output, StatsFuture *future) {
+  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
   const index_t batch = input->dim(0);
   const index_t height = input->dim(1);
@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   const index_t channels_per_group = channels / groups_;
   MACE_CHECK(channels_per_group % 4 == 0,
              "channels per group must be multiple of 4");
-  MACE_CHECK(groups_ % 4 == 0,
-             "groups must be multiple of 4");
+  MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
   const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
-                                   built_options);
+    kernel_ =
+        runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key =
-      Concat("channel_shuffle_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
   TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   return MACE_SUCCESS;
 }
 
-template
-struct ChannelShuffleFunctor<DeviceType::GPU, float>;
-template
-struct ChannelShuffleFunctor<DeviceType::GPU, half>;
+template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
+template struct ChannelShuffleFunctor<DeviceType::GPU, half>;
 
 }  // namespace kernels
 }  // namespace mace
@@ -22,11 +22,9 @@ namespace mace {
 namespace kernels {
 
 namespace {
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
-                              const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = cache_size / kBaseGPUMemCacheSize;
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
@@ -37,8 +35,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
-static void Concat2(cl::Kernel *kernel,
+static MaceStatus Concat2(cl::Kernel *kernel,
                     const Tensor *input0,
                     const Tensor *input1,
                     const DataType dt,
@@ -68,7 +65,7 @@ static void Concat2(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
-      Concat("concat_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -125,9 +122,11 @@ static void Concat2(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
-static void ConcatN(cl::Kernel *kernel,
+static MaceStatus ConcatN(cl::Kernel *kernel,
                     const std::vector<const Tensor *> &input_list,
                     const DataType dt,
                     Tensor *output,
@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel,
       if (runtime->is_profiling_enabled()) {
         CallStats tmp_stats;
         runtime->GetCallStats(event, &tmp_stats);
-        call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros,
-                                                    call_stats.start_micros);
+        call_stats.start_micros =
+            std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
         call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
       }
     }
@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel,
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template <typename T>
@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
       "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
 
   switch (inputs_count) {
     case 2:
-      Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future, &kwg_size_, &kernel_error_);
-      break;
+      return Concat2(&kernel_, input_list[0], input_list[1],
+                     DataTypeToEnum<T>::value, &input_shape_, output, future,
+                     &kwg_size_, &kernel_error_);
    default:
      if (divisible_four) {
-        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
-                &kwg_size_, &kernel_error_);
+        return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
+                       future, &kwg_size_, &kernel_error_);
      } else {
        MACE_NOT_IMPLEMENTED;
      }
...
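Concat2 and ConcatN move from static void to static MaceStatus here, which lets the functor's switch forward the helper's status instead of unconditionally reporting success. The shape of the change, reduced to a minimal sketch with illustrative names (RunTwoInput/RunNInput are not MACE symbols):

    static MaceStatus RunTwoInput() { return MACE_SUCCESS; }
    static MaceStatus RunNInput() { return MACE_SUCCESS; }

    MaceStatus Dispatch(int inputs_count) {
      switch (inputs_count) {
        case 2:
          return RunTwoInput();  // 'return' replaces the old call + 'break'
        default:
          return RunNInput();
      }
    }

Returning directly makes any failure from Allocate or the kernel launch visible to the caller instead of being silently dropped.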
@@ -18,7 +18,7 @@
 namespace mace {
 namespace kernels {
 
-extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
@@ -34,7 +34,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              uint32_t *kwg_size,
                              std::unique_ptr<BufferBase> *kernel_error);
 
-extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
@@ -50,7 +50,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              uint32_t *kwg_size,
                              std::unique_ptr<BufferBase> *kernel_error);
 
-extern void Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
                          const Tensor *input,
                          const Tensor *filter,
                          const Tensor *bias,
@@ -72,7 +72,7 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *bias,
                                                          Tensor *output,
                                                          StatsFuture *future) {
-  typedef void (*Conv2dOpenclFunction)(
+  typedef MaceStatus (*Conv2dOpenclFunction)(
       cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
       const Tensor *bias, const int stride, const int *padding,
       const int *dilations, const ActivationType activation,
@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
-    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
-                dilations_, activation_, relux_max_limit_,
-                DataTypeToEnum<T>::value, &input_shape_, output, future,
-                &kwg_size_, &kernel_error_);
+    return conv2d_func(
+        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
+        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
+        output, future, &kwg_size_, &kernel_error_);
   } else {
-    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
-                 dilations_, activation_, relux_max_limit_,
-                 DataTypeToEnum<T>::value, &input_shape_, output, future,
-                 &kwg_size_, &kernel_error_);
+    return Conv2dOpencl(
+        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
+        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
+        output, future, &kwg_size_, &kernel_error_);
   }
-
-  return MACE_SUCCESS;
 }
 
 template struct Conv2dFunctor<DeviceType::GPU, float>;
...
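The typedef change is what keeps the selector table consistent: the function-pointer type must match the helpers' new status-returning signatures or the table initializer stops compiling. A minimal sketch of the pattern, assuming the MaceStatus/MACE_SUCCESS types from the public headers are in scope (names and arity are illustrative):

    typedef MaceStatus (*KernelFn)(int stride);

    MaceStatus Kernel1x1(int stride) { return MACE_SUCCESS; }
    MaceStatus KernelGeneral(int stride) { return MACE_SUCCESS; }

    MaceStatus RunSelected(int kernel_h, int stride) {
      // Indexed by kernel_h - 1, as in the selector table above.
      static const KernelFn selector[5] = {Kernel1x1, nullptr, nullptr,
                                           nullptr, nullptr};
      if (kernel_h >= 1 && kernel_h <= 5 && selector[kernel_h - 1] != nullptr) {
        return selector[kernel_h - 1](stride);  // status propagates upward
      }
      return KernelGeneral(stride);  // stand-in for the general fallback path
    }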
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"
 
@@ -25,11 +25,9 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 128;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
-                              const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
   uint32_t base = cache_size / kBaseGPUMemCacheSize;
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
   lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
   const uint32_t lws_size = lws[0] * lws[1];
   lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
-      gws[2]);
+      (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
   if (lws[2] == 0) {
     lws[2] = std::min<uint32_t>(gws[2], base);
   }
@@ -57,7 +54,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
-extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
-      Concat("conv2d_1x1_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace kernels
...
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/activation.h"
+#include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"
 #include "mace/utils/utils.h"
@@ -24,22 +24,20 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
-                              const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = std::max<uint32_t>(
       OpenCLRuntime::Global()->device_compute_units() / 2, 1);
-  const uint32_t base = std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize,
-                                           4);
+  const uint32_t base =
+      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
-                              kwg_size / lws[1]);
+  lws[0] =
+      std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
   const uint32_t lws_size = lws[0] * lws[1];
   lws[2] = std::min<uint32_t>(
-      RoundUp<uint32_t>(cache_size / kernel_cache_size /
-                        lws_size / compute_units, base),
+      RoundUp<uint32_t>(
+          cache_size / kernel_cache_size / lws_size / compute_units, base),
       gws[2]);
   if (lws[2] == 0) {
     lws[2] = std::min<uint32_t>(gws[2], base);
@@ -50,7 +48,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
-extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
-      Concat("conv2d_3x3_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace kernels
...
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/activation.h"
+#include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"
 #include "mace/utils/utils.h"
@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
   uint32_t base = cache_size / kBaseGPUMemCacheSize;
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
@@ -41,9 +40,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
   }
   lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
   const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units)
-      * 8,
-      gws[2]);
+  lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
+                               lws_size / compute_units) *
+                                  8,
+                              gws[2]);
   if (lws[2] == 0) {
     if (gws[2] < lws_limit) {
@@ -58,7 +57,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
-extern void Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
                          const Tensor *input,
                          const Tensor *filter,
                          const Tensor *bias,
@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -168,9 +167,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
   }
 
   std::string tuning_key =
-      Concat("conv2d_general_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3),
-             filter->dim(2), filter->dim(3));
+      Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace kernels
...
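Every kernel above builds its autotuning cache key with Concat over the output (and sometimes filter) dimensions. A rough sketch of such a variadic key builder; the real helper lives in mace/utils and the '_' separator is an assumption here:

    #include <sstream>
    #include <string>

    template <typename... Args>
    std::string Concat(const Args &... args) {
      std::ostringstream ss;
      int expand[] = {0, ((ss << args << '_'), 0)...};  // C++11 pack expansion
      (void)expand;
      std::string key = ss.str();
      if (!key.empty()) key.pop_back();  // drop the trailing '_'
      return key;
    }

    // e.g. Concat("conv2d_3x3_opencl_kernel", 1, 224, 224, 32)
    //   could yield "conv2d_3x3_opencl_kernel_1_224_224_32"

Keying on concrete shapes means a tuned workgroup size is reused only for the exact geometry it was measured on.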
@@ -20,7 +20,7 @@ namespace kernels {
 namespace {
 
-void Deconv2dOpencl(cl::Kernel *kernel,
+MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
                     const Tensor *input,
                     const Tensor *filter,
                     const Tensor *bias,
@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
 #define MACE_WIDTH_BLK 5
   const index_t n_strides = (width + stride - 1) / stride;
   const index_t width_blocks =
-      ((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride;
+      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
   const float stride_r = 1.f / static_cast<float>(stride);
-  const int padding_h = (paddings[0]+1) >> 1;
-  const int padding_w = (paddings[0]+1) >> 1;
+  const int padding_h = (paddings[0] + 1) >> 1;
+  const int padding_w = (paddings[0] + 1) >> 1;
   const int align_h = stride - 1 - padding_h;
   const int align_w = stride - 1 - padding_w;
@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel,
     }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
-      case NOOP:break;
-      case RELU:built_options.emplace("-DUSE_RELU");
+      case NOOP:
         break;
-      case RELUX:built_options.emplace("-DUSE_RELUX");
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
         break;
-      case TANH:built_options.emplace("-DUSE_TANH");
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
         break;
-      case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
         break;
-      default:LOG(FATAL) << "Unknown activation type: " << activation;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
     }
 
     *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
@@ -150,12 +156,15 @@ void Deconv2dOpencl(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace
 
 template <typename T>
-MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
     const Tensor *filter,
     const Tensor *bias,
     Tensor *output,
@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   if (output_shape_.size() == 4) {
     paddings_.clear();
     paddings_ = std::vector<int>(2, 0);
-    CalcDeconvPaddingAndInputSize(
-        input->shape().data(),
-        filter->shape().data(),
-        strides_, padding_type_,
-        output_shape_.data(),
-        paddings_.data());
+    CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
+                                  strides_, padding_type_, output_shape_.data(),
+                                  paddings_.data());
   } else {
     output_shape_.clear();
     output_shape_ = std::vector<index_t>(4, 0);
-    CalcDeconvOutputSize(input->shape().data(),
-                         filter->shape().data(),
-                         strides_,
-                         output_shape_.data(),
-                         paddings_.data());
+    CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
+                         strides_, output_shape_.data(), paddings_.data());
   }
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape_, output_image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape_, output_image_shape));
 
-  Deconv2dOpencl(&kernel_, input, filter, bias,
-                 strides_[0], paddings_.data(),
-                 activation_, relux_max_limit_,
-                 DataTypeToEnum<T>::value, &input_shape_,
-                 output, future, &kwg_size_, &kernel_error_);
-
-  return MACE_SUCCESS;
+  return Deconv2dOpencl(&kernel_, input, filter, bias, strides_[0],
+                        paddings_.data(), activation_, relux_max_limit_,
+                        DataTypeToEnum<T>::value, &input_shape_, output, future,
+                        &kwg_size_, &kernel_error_);
 }
 
 template struct Deconv2dFunctor<DeviceType::GPU, float>;
...
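The width_blocks expression above is a round-up division followed by re-scaling by the stride. A quick numeric check with illustrative values (width, stride, and the block size are picked for the example, not taken from a model):

    #include <cassert>

    int main() {
      const int width = 13, stride = 2, kWidthBlk = 5;  // MACE_WIDTH_BLK
      const int n_strides = (width + stride - 1) / stride;  // ceil(13/2) = 7
      const int width_blocks =
          ((n_strides + kWidthBlk - 1) / kWidthBlk) * stride;
      assert(n_strides == 7);
      assert(width_blocks == 4);  // ceil(7/5) = 2 blocks, times stride 2
      return 0;
    }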
@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
 
   auto runtime = OpenCLRuntime::Global();
@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
           new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("depth_to_space",
-                             obfuscated_kernel_name, built_options);
+    kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
+                                   built_options);
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -24,8 +24,7 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
-                              const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
@@ -40,8 +39,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
     }
   }
   const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size) * 4,
+  lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
                               gws[2]);
   if (lws[2] == 0) {
     lws[2] = gws[2];
@@ -52,7 +50,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
-static void DepthwiseConv2d(cl::Kernel *kernel,
+static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
                             const Tensor *input,   // NHWC
                             const Tensor *filter,  // HWIM
                             const Tensor *bias,
@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
   }
 
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
-  std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel",
-                                  gws[0], gws[1], gws[2], multiplier);
+  std::string tuning_key =
+      Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template <typename T>
@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *bias,
     Tensor *output,
     StatsFuture *future) {
-
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
   if (strides_[0] != strides_[1]) {
@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
-                  dilations_, activation_, relux_max_limit_,
-                  DataTypeToEnum<T>::value, &input_shape_, output, future,
-                  &kwg_size_, &kernel_error_);
-
-  return MACE_SUCCESS;
+  return DepthwiseConv2d(
+      &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
+      activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
+      output, future, &kwg_size_, &kernel_error_);
 }
 
 template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
...
@@ -28,9 +28,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   MACE_UNUSED(future);
   bool swapped = false;
   if (input1 != nullptr) {
-    MACE_CHECK(input0->dim_size() == input1->dim_size()
-                   || input0->dim_size() == 1
-                   || input1->dim_size() == 1)
+    MACE_CHECK(input0->dim_size() == input1->dim_size() ||
+               input0->dim_size() == 1 || input1->dim_size() == 1)
         << "Inputs of Eltwise op must be same shape";
     if (input0->size() != input1->size()) {
       if (input0->size() < input1->size()) {
@@ -42,25 +41,23 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
             << "Element-Wise op only support channel dimension broadcast";
       } else {
         MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
-                   input0->dim(3) == input1->dim(3) &&
-                   input1->dim(1) == 1 &&
+                   input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
                    input1->dim(2) == 1)
             << "Element-Wise op only support channel dimension broadcast";
       }
     }
   }
-  std::vector<index_t > output_shape(4);
+  std::vector<index_t> output_shape(4);
   output_shape[0] = input0->dim(0);
   output_shape[1] = input0->dim(1);
   output_shape[2] = input0->dim(2);
   output_shape[3] = input0->dim(3);
 
   std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape,
-                  BufferType::IN_OUT_CHANNEL,
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      kernel_error_->Allocate(1);
+      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key =
-      Concat("eltwise_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
   TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
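For the NHWC tensors used here, the broadcast check reduces to a small predicate; restated standalone for readability (the function name is illustrative, dims are {N, H, W, C}):

    #include <cstdint>
    #include <vector>

    // input1 may broadcast over input0 only along batch, with H = W = 1 and
    // matching channels; mirrors the MACE_CHECK conditions above.
    bool ChannelBroadcastOk(const std::vector<int64_t> &in0,
                            const std::vector<int64_t> &in1) {
      return (in0[0] == in1[0] || in1[0] == 1) &&  // batch equal or broadcast
             in0[3] == in1[3] &&                   // channels must match
             in1[1] == 1 && in1[2] == 1;           // H and W collapsed to 1
    }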
@@ -20,7 +20,7 @@ namespace kernels {
 namespace {
 
 template <typename T>
-void FCWXKernel(cl::Kernel *kernel,
+MaceStatus FCWXKernel(cl::Kernel *kernel,
                 const Tensor *input,
                 const Tensor *weight,
                 const Tensor *bias,
@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -170,10 +170,12 @@ void FCWXKernel(cl::Kernel *kernel,
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template <typename T>
-void FCWTXKernel(cl::Kernel *kernel,
+MaceStatus FCWTXKernel(cl::Kernel *kernel,
                  const Tensor *input,
                  const Tensor *weight,
                  const Tensor *bias,
@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel,
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
-      (*kernel_error)->Allocate(1);
+      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel,
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    *lws = {16, kwg_size/16, 0};
+    *lws = {16, kwg_size / 16, 0};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t batch = output->dim(0);
@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel,
   }
 
   std::string tuning_key =
-      Concat("fc_opencl_kernel", output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
+      Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
+             output->dim(3));
   TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace
@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                activation_, &gws_, &lws_, relux_max_limit_, future,
-                &kernel_error_);
-
-  return MACE_SUCCESS;
+  return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
+                       activation_, &gws_, &lws_, relux_max_limit_, future,
+                       &kernel_error_);
 }
 
 template struct FullyConnectedFunctor<DeviceType::GPU, float>;
...
@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                        const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = cache_size / kBaseGPUMemCacheSize;
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
-                              kwg_size / lws[1]);
+  lws[2] =
+      std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
   const uint32_t lws_size = lws[1] * lws[2];
   lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
   return lws;
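To see what Default3DLocalWS produces, one worked example with assumed device numbers (the cache size, kBaseGPUMemCacheSize, and work sizes are illustrative, not measured):

    // cache_size = 131072, kBaseGPUMemCacheSize = 16384  -> base = 8
    // kwg_size = 256, gws = {64, 32, 16}:
    //   lws[1] = min(32, 256)                 = 32
    //   lws[2] = min(min(16, 8), 256 / 32)    = 8
    //   lws[0] = min(8, 256 / (32 * 8))       = 1
    // -> lws = {1, 32, 8}: capped by both the cache-derived 'base' and the
    //    kernel's maximum workgroup size.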
...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 4) MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D"; << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+3); std::vector<uint32_t> internal_gws(gws, gws + 3);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3]; uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<std::vector<uint32_t>> results; std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = { std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0}, {kwg_size / 2, 2, 0}, {kwg_size / 4, 4, 0},
{kwg_size / 4, 4, 0}, {kwg_size / 8, 8, 0}, {kwg_size / 16, 16, 0},
{kwg_size / 8, 8, 0}, {kwg_size / 32, 32, 0}, {kwg_size / 64, 64, 0},
{kwg_size / 16, 16, 0}, {kwg_size / 128, 128, 0}, {kwg_size / 256, 256, 0},
{kwg_size / 32, 32, 0}, {kwg_size, 1, 0}, {1, kwg_size, 0}};
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
for (auto &ele : candidates) { for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2]; const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) { if (0 < tmp && tmp <= kwg_size) {
...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 3) MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d"; << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+2); std::vector<uint32_t> internal_gws(gws, gws + 2);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2]; uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
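
On devices without non-uniform workgroup support, both tuning paths above pad each global dimension to a multiple of the tuned local size with RoundUp; on devices with such support the gws stays unpadded and only the final block is shrunk via gws2 = internal_gws[2] - i * block_size. The helpers themselves are not shown in this diff; a minimal sketch consistent with how they are called:

    #include <cstdint>

    // Smallest multiple of factor that is >= value (factor > 0 assumed).
    template <typename T>
    T RoundUp(T value, T factor) {
      return (value + factor - 1) / factor * factor;
    }

    // Ceiling division: how many factor-sized blocks cover value.
    template <typename T>
    T RoundUpDiv(T value, T factor) {
      return (value + factor - 1) / factor;
    }

For example, RoundUp<uint32_t>(100, 16) == 112 and RoundUpDiv<uint32_t>(100, 16) == 7; with an unpadded gws of 100 the seventh block covers only 100 - 6 * 16 = 4 items, which is exactly the last-block adjustment made in both kernels above.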
...@@ -88,8 +88,7 @@ inline bool LimitKernelTime() { ...@@ -88,8 +88,7 @@ inline bool LimitKernelTime() {
} }
template <typename T> template <typename T>
bool IsVecEqual(const std::vector<T> &input0, bool IsVecEqual(const std::vector<T> &input0, const std::vector<T> &input1) {
const std::vector<T> &input1) {
return ((input0.size() == input1.size()) && return ((input0.size() == input1.size()) &&
(std::equal(input0.begin(), input0.end(), input1.begin()))); (std::equal(input0.begin(), input0.end(), input1.begin())));
} }
......
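
IsVecEqual, joined onto one line above, is the shape comparison the GPU functors use to decide whether cached kernel arguments are stale. A typical call site looks roughly like this (input_shape_ being the cached shape member such functors keep; a sketch, not copied from this commit):

    if (!IsVecEqual(input_shape_, input->shape())) {
      // Shape changed since the last run: the cached kernel arguments are
      // stale, so re-set them and remember the new shape.
      // ... kernel_.setArg(...) calls ...
      input_shape_ = input->shape();
    }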
...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
const BufferType type, const BufferType type,
Tensor *buffer, Tensor *buffer,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape); CalImage2DShape(image->shape(), type, &image_shape);
MACE_FAILURE_RETURN(buffer->Resize(image->shape())); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])}; static_cast<uint32_t>(image_shape[1])};
...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} }
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) { if (type == CONV2D_FILTER) {
const index_t inner_size = const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
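
The substance of this commit is mechanical: MACE_FAILURE_RETURN becomes MACE_RETURN_IF_ERROR, and previously unchecked calls such as Allocate(1) and Resize(...) are wrapped so that allocation failures propagate to the caller. The macro itself is defined outside this diff; a plausible minimal sketch of such an early-return macro:

    // Sketch only; the real macro lives in MACE's headers and may also log.
    #define MACE_RETURN_IF_ERROR(stmt)   \
      do {                               \
        MaceStatus status_ = (stmt);     \
        if (status_ != MACE_SUCCESS) {   \
          return status_;                \
        }                                \
      } while (0)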
...@@ -29,7 +29,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -29,7 +29,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_FAILURE_RETURN(C->ResizeImage(c_shape, c_image_shape)); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0); const index_t batch = C->dim(0);
const index_t height = C->dim(1); const index_t height = C->dim(1);
...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
Concat("matmul_opencl_kernel", C->dim(0), C->dim(2), C->dim(3));
C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
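
The tuning keys reflowed in this and the following hunks are built with a variadic Concat over the output dimensions, so every distinct output shape gets its own cached tuning result. A minimal sketch of such a helper, assuming '_' as the separator (the real joiner is defined elsewhere in MACE):

    #include <sstream>
    #include <string>

    namespace {
    void ConcatImpl(std::stringstream *) {}

    template <typename T, typename... Args>
    void ConcatImpl(std::stringstream *ss, const T &v, const Args &... args) {
      (*ss) << "_" << v;
      ConcatImpl(ss, args...);
    }
    }  // namespace

    template <typename... Args>
    std::string Concat(const std::string &name, const Args &... args) {
      std::stringstream ss;
      ss << name;
      ConcatImpl(&ss, args...);
      return ss.str();
    }

    // Concat("matmul_opencl_kernel", 1, 64, 64, 1)
    //   == "matmul_opencl_kernel_1_64_64_1"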
...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>( kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0; *(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap(); kernel_error->UnMap();
...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
bool is_out_of_range = false; bool is_out_of_range = false;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
is_out_of_range = is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
*(kernel_error->mutable_data<char>()) == 1 ? true : false;
kernel_error->UnMap(); kernel_error->UnMap();
} }
return is_out_of_range; return is_out_of_range;
...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class OutOfRangeCheckTest : public ::testing::Test { class OutOfRangeCheckTest : public ::testing::Test {
protected: protected:
virtual void SetUp() { virtual void SetUp() { setenv("OUT_OF_RANGE_CHECK", "1", 1); }
setenv("OUT_OF_RANGE_CHECK", "1", 1);
}
}; };
TEST(OutOfRangeCheckTest, RandomTest) { TEST(OutOfRangeCheckTest, RandomTest) {
...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels}; std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws; Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer", Tensor *buffer =
GetDeviceAllocator(DeviceType::GPU), ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
......
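
Every Allocate(1) newly checked in these hunks belongs to the same OUT_OF_RANGE_CHECK protocol that the test above exercises end to end: a one-byte flag is zeroed on the host before launch, the kernel sets it on an out-of-range access, and the host reads it back afterwards. Factored into two hypothetical helpers for illustration (helper names invented; the calls are taken from the diff):

    // Hypothetical helper: allocate and clear the one-byte error flag.
    MaceStatus PrepareKernelErrorFlag(std::unique_ptr<Buffer> *kernel_error) {
      *kernel_error = std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
      MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;  // clear before launch
      (*kernel_error)->UnMap();
      return MACE_SUCCESS;
    }

    // Hypothetical helper: read the flag back after the kernel has run.
    bool ReadKernelErrorFlag(Buffer *kernel_error) {
      kernel_error->Map(nullptr);
      bool out_of_range = *(kernel_error->mutable_data<char>()) == 1;
      kernel_error->UnMap();
      return out_of_range;
    }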
...@@ -20,26 +20,25 @@ ...@@ -20,26 +20,25 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK( MACE_CHECK(this->paddings_.size() ==
this->paddings_.size() == static_cast<size_t>((input->dim_size() * 2))); static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
&& (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now"; << "Mace only support height/width dimension now";
auto input_shape = input->shape(); auto input_shape = input->shape();
std::vector<index_t> std::vector<index_t> output_shape = {
output_shape = {input_shape[0] + this->paddings_[0] + this->paddings_[1], input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3], input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5], input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]}; input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(2), output->dim(3));
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct PadFunctor<DeviceType::GPU, float>;
struct PadFunctor<DeviceType::GPU, float>; template struct PadFunctor<DeviceType::GPU, half>;
template
struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
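
The reflowed output_shape initializer in the pad functor adds each (before, after) padding pair to the matching NHWC dimension, with the two MACE_CHECKs pinning the batch and channel pads to zero. The arithmetic as a tiny sketch, using hypothetical shapes:

    #include <cstdint>
    #include <vector>
    using index_t = int64_t;

    std::vector<index_t> PaddedShape(const std::vector<index_t> &in,
                                     const std::vector<int> &paddings) {
      // paddings holds one (before, after) pair per dimension.
      std::vector<index_t> out(in.size());
      for (size_t i = 0; i < in.size(); ++i) {
        out[i] = in[i] + paddings[2 * i] + paddings[2 * i + 1];
      }
      return out;
    }

    // PaddedShape({1, 32, 32, 4}, {0, 0, 1, 1, 2, 2, 0, 0})
    //   == {1, 34, 36, 4}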
...@@ -23,15 +23,13 @@ namespace kernels { ...@@ -23,15 +23,13 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), lws[2] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4; lws[0] = gws[0] / 4;
if (lws[0] == 0) { if (lws[0] == 0) {
...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t out_height = output->dim(1); index_t out_height = output->dim(1);
...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -23,11 +23,9 @@ namespace mace { ...@@ -23,11 +23,9 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -30,13 +30,14 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -30,13 +30,14 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const index_t output_channels = input_channels / outputs_count; const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0) MACE_CHECK(output_channels % 4 == 0)
<< "output channels of slice op must be divisible by 4"; << "output channels of slice op must be divisible by 4";
std::vector<index_t> output_shape({input->dim(0), input->dim(1), std::vector<index_t> output_shape(
input->dim(2), output_channels}); {input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE=" +
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blk = RoundUpDiv4(output_channels); const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->is_profiling_enabled()) { if (runtime->is_profiling_enabled()) {
CallStats tmp_stats; CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats); runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros =
call_stats.start_micros); std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
} }
} }
...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct SliceFunctor<DeviceType::GPU, float>;
struct SliceFunctor<DeviceType::GPU, float>; template struct SliceFunctor<DeviceType::GPU, half>;
template
struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
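
The slice kernel above launches one work-item per block of four output channels (channel_blk = RoundUpDiv4(output_channels)), which is why the op requires output channels divisible by 4. RoundUpDiv4 is assumed to be plain ceiling division by four:

    #include <cstdint>

    template <typename T>
    T RoundUpDiv4(T value) {
      return (value + 3) >> 2;  // ceil(value / 4) for non-negative value
    }

    // input_channels = 32 split into outputs_count = 2 slices gives
    // output_channels = 16, channel_blk = RoundUpDiv4(16) = 4, and
    // gws = {4, width, batch * height}.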
...@@ -24,10 +24,8 @@ namespace kernels { ...@@ -24,10 +24,8 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) { uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0), Concat("softmax_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -26,17 +26,13 @@ namespace kernels { ...@@ -26,17 +26,13 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
Tensor *batch_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
if (b2s_) { if (b2s_) {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} }
...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
if (b2s_) { if (b2s_) {
MACE_FAILURE_RETURN(space_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); space_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "batch_to_space"; kernel_name = "batch_to_space";
} else { } else {
MACE_FAILURE_RETURN(batch_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); batch_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "space_to_batch"; kernel_name = "space_to_batch";
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
runtime->BuildKernel("space_to_batch", built_options);
obfuscated_kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......
...@@ -24,7 +24,6 @@ namespace kernels { ...@@ -24,7 +24,6 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -78,7 +77,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -78,7 +77,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_shape = {16, input_tensor->dim(3), out_width, 1}; output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = std::string tuning_key = Concat("winograd_transform_kernel",
Concat("winograd_transform_kernel", output_tensor->dim(0), output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(2), output_tensor->dim(3));
output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -188,7 +185,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -188,7 +185,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
input_tensor->dim(1)}; input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
......
...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase { ...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase {
MACE_CHECK( MACE_CHECK(
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2); this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape(); auto input_shape = input->shape();
MACE_FAILURE_RETURN(output->Resize({input_shape[0] + this->paddings_[0] MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0]
+ this->paddings_[1], + this->paddings_[1],
input_shape[1] + this->paddings_[2] input_shape[1] + this->paddings_[2]
+ this->paddings_[3], + this->paddings_[3],
......
...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
RoundType::CEIL, RoundType::CEIL,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor); Tensor::MappingGuard output_guard(output_tensor);
......
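
The CPU functors in these hunks take Tensor::MappingGuard on inputs and outputs before touching raw pointers; it is an RAII wrapper that maps the tensor's buffer on entry and unmaps it when the guard leaves scope, which matters when a tensor is backed by mappable device memory. A plausible sketch (the real class may differ, e.g. in const handling):

    // Hypothetical RAII map/unmap guard.
    class MappingGuardSketch {
     public:
      explicit MappingGuardSketch(Tensor *tensor) : tensor_(tensor) {
        if (tensor_ != nullptr) tensor_->Map();    // map on scope entry
      }
      ~MappingGuardSketch() {
        if (tensor_ != nullptr) tensor_->UnMap();  // unmap on scope exit
      }
      MappingGuardSketch(const MappingGuardSketch &) = delete;
      MappingGuardSketch &operator=(const MappingGuardSketch &) = delete;

     private:
      Tensor *tensor_;
    };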
...@@ -267,7 +267,7 @@ struct ProposalFunctor { ...@@ -267,7 +267,7 @@ struct ProposalFunctor {
// Our RPN implementation only supports a single input image, so all // Our RPN implementation only supports a single input image, so all
// batch inds are 0 // batch inds are 0
size = static_cast<int>(nms_result.size()); size = static_cast<int>(nms_result.size());
MACE_FAILURE_RETURN(output->Resize({size, 1, 1, 5})); MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5}));
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<float>();
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
......
...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor { ...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor {
const index_t num_rois = rois->dim(0); const index_t num_rois = rois->dim(0);
const index_t batch_size = input->dim(0); const index_t batch_size = input->dim(0);
MACE_FAILURE_RETURN(output->Resize({num_rois, pooled_height, pooled_width, MACE_RETURN_IF_ERROR(output->Resize({num_rois, pooled_height, pooled_width,
output_dim_})); output_dim_}));
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
......
...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
index_t out_width = out_width_; index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, channels, out_height, out_width};
MACE_FAILURE_RETURN(output->Resize(out_shape)); MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
......
...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase { ...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
1, 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i= 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape));
output_ptrs[i] = output_list[i]->mutable_data<T>(); output_ptrs[i] = output_list[i]->mutable_data<T>();
} }
const T *input_ptr = input->data<T>(); const T *input_ptr = input->data<T>();
......
...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { ...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(space_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(batch_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
} }
Tensor::MappingGuard input_guard(space_tensor); Tensor::MappingGuard input_guard(space_tensor);
......
...@@ -15,7 +15,6 @@ cc_library( ...@@ -15,7 +15,6 @@ cc_library(
hdrs = [ hdrs = [
"ops_test_util.h", "ops_test_util.h",
], ],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [ deps = [
"//mace/core", "//mace/core",
"@gtest//:gtest", "@gtest//:gtest",
...@@ -36,18 +35,23 @@ cc_library( ...@@ -36,18 +35,23 @@ cc_library(
[ [
"buffer_to_image.cc", "buffer_to_image.cc",
"image_to_buffer.cc", "image_to_buffer.cc",
]), ],
),
hdrs = glob( hdrs = glob(
["*.h"], ["*.h"],
exclude = ["ops_test_util.h"], exclude = ["ops_test_util.h"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [ deps = [
"//mace/kernels", "//mace/kernels",
], ],
...@@ -60,13 +64,17 @@ cc_test( ...@@ -60,13 +64,17 @@ cc_test(
srcs = glob( srcs = glob(
["*_test.cc"], ["*_test.cc"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
...@@ -80,13 +88,17 @@ cc_test( ...@@ -80,13 +88,17 @@ cc_test(
name = "ops_benchmark", name = "ops_benchmark",
testonly = 1, testonly = 1,
srcs = glob(["*_benchmark.cc"]), srcs = glob(["*_benchmark.cc"]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
......
...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> { ...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> {
functor_(kernels::StringToActivationType( functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
static_cast<T>(OperatorBase::GetOptionalArg<float>( static_cast<T>(
"max_limit", 0.0f))) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr; this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->Output(0); Tensor *output_tensor = this->Output(0);
MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensor)); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor));
return functor_(input_tensor, alpha_tensor, output_tensor, future); return functor_(input_tensor, alpha_tensor, output_tensor, future);
} }
......
...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { ...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::GPU>(); TestUnalignedSimpleRelu<DeviceType::GPU>();
} }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void TestSimpleRelux() { void TestSimpleRelux() {
...@@ -169,9 +168,7 @@ void TestSimpleRelux() { ...@@ -169,9 +168,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) { TEST_F(ActivationOpTest, OPENCLSimple) { TestSimpleRelux<DeviceType::GPU>(); }
TestSimpleRelux<DeviceType::GPU>();
}
namespace { namespace {
template <DeviceType D> template <DeviceType D>
...@@ -278,9 +275,7 @@ void TestSimplePrelu() { ...@@ -278,9 +275,7 @@ void TestSimplePrelu() {
} }
} // namespace } // namespace
TEST_F(ActivationOpTest, CPUSimplePrelu) { TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::GPU>(); TestSimplePrelu<DeviceType::GPU>();
......
...@@ -97,8 +97,8 @@ void SimpleAdd3() { ...@@ -97,8 +97,8 @@ void SimpleAdd3() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 2, 3, 1}, auto expected =
{-0.000713, 8, 12, 16, 20, 24}); CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
} }
...@@ -160,8 +160,8 @@ void RandomTest() { ...@@ -160,8 +160,8 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-2); 1e-2);
} }
} }
} // namespace } // namespace
......
...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> { ...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> {
var->dim_size()); var->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future); return functor_(input, scale, offset, mean, var, epsilon_, output, future);
} }
......
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class BatchNormOpTest : public OpsTestBase {}; class BatchNormOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -79,10 +79,9 @@ void Simple() { ...@@ -79,10 +79,9 @@ void Simple() {
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
0.8291, 0.8291, 3.1708, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
} }
...@@ -103,16 +102,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,16 +102,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -129,9 +126,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -129,9 +126,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -186,16 +181,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -186,16 +181,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -211,9 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -211,9 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -269,16 +260,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,16 +260,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -294,9 +283,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -294,9 +283,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -351,16 +338,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -351,16 +338,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -376,9 +361,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -376,9 +361,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
......
...@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT); const Tensor *batch_tensor = this->Input(INPUT);
Tensor *space_tensor = this->Output(OUTPUT); Tensor *space_tensor = this->Output(OUTPUT);
return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future);
future);
} }
private: private:
......
...@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> { ...@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> {
bias->dim_size()); bias->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, bias, output, future); return functor_(input, bias, output, future);
} }
......
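The MACE_FAILURE_RETURN -> MACE_RETURN_IF_ERROR rename applied throughout this commit is a status-propagation macro: Resize/Allocate now report a MaceStatus instead of aborting, and each caller bubbles a failure straight up. A hedged sketch of what such a macro typically expands to (the exact definition in the MACE headers may differ, e.g. it may also log the failing statement):

// Hypothetical sketch of a status-propagation macro; assumes the
// plain-enum MaceStatus with MACE_SUCCESS used by MACE at this time.
#define MACE_RETURN_IF_ERROR(stmt)          \
  {                                         \
    MaceStatus status_ = (stmt);            \
    if (status_ != MACE_SUCCESS) {          \
      return status_;                       \
    }                                       \
  }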
...@@ -32,9 +32,7 @@ void BiasAddSimple() { ...@@ -32,9 +32,7 @@ void BiasAddSimple() {
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -43,10 +41,8 @@ void BiasAddSimple() { ...@@ -43,10 +41,8 @@ void BiasAddSimple() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -79,9 +75,7 @@ void BiasAddSimple() { ...@@ -79,9 +75,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); } TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { BiasAddSimple<DeviceType::GPU>(); }
BiasAddSimple<DeviceType::GPU>();
}
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
Tensor expected; Tensor expected;
......
...@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::GPU, half>( TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
kernels::ARGUMENT, {2}, input_data); {2}, input_data);
} }
} // namespace test } // namespace test
......
...@@ -29,9 +29,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { ...@@ -29,9 +29,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input", {1, 1, 2, 8}, "Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
...@@ -43,9 +41,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { ...@@ -43,9 +41,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
......
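For reference, the C8G4_CPU case above pushes channels 0..15 (two pixels, eight channels) through a four-group shuffle. Assuming the usual reshape-to-(G, C/G)-then-transpose convention, output channel i reads input channel (i % G) * (C / G) + i / G; a small sketch:

#include <cstdint>
#include <vector>

// Channel shuffle over an NHWC buffer flattened to (pixels, channels).
std::vector<float> ChannelShuffle(const std::vector<float> &src,
                                  int64_t pixels, int64_t channels,
                                  int64_t groups) {
  const int64_t group_size = channels / groups;
  std::vector<float> dst(src.size());
  for (int64_t p = 0; p < pixels; ++p)
    for (int64_t i = 0; i < channels; ++i)
      dst[p * channels + i] =
          src[p * channels + (i % groups) * group_size + i / groups];
  return dst;
}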
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <functional> #include <functional>
#include <string>
#include "gmock/gmock.h" #include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h" #include "mace/ops/concat.h"
#include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
int concat_axis_size = 0; int concat_axis_size = 0;
// Construct graph // Construct graph
std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>()); std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
std::vector<const float*> input_ptrs(num_inputs); std::vector<const float *> input_ptrs(num_inputs);
OpsTestNet net; OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i); const std::string input_name = MakeString("Input", i);
...@@ -171,8 +171,8 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -171,8 +171,8 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
shapes[i], inputs[i]); inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name, BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
......

...@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> { ...@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))), "padding", static_cast<int>(SAME)))),
paddings_(OperatorBase::GetRepeatedArgs<int>("padding_values")), paddings_(OperatorBase::GetRepeatedArgs<int>("padding_values")),
dilations_( dilations_(OperatorBase::GetRepeatedArgs<int>("dilations", {1, 1})) {}
OperatorBase::GetRepeatedArgs<int>("dilations", {1, 1})) {}
protected: protected:
std::vector<int> strides_; std::vector<int> strides_;
......
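The strides_/padding_type_/paddings_/dilations_ arguments parsed here feed the standard output-size arithmetic for conv/pool ops. A sketch under TensorFlow-style SAME/VALID conventions (an assumption; MACE also accepts explicit padding_values, which this sketch ignores):

#include <cstdint>

// Output extent along one spatial axis for kernel k, stride s, dilation d.
inline int64_t OutputSize(int64_t in, int64_t k, int64_t s, int64_t d,
                          bool same_padding) {
  const int64_t effective_k = (k - 1) * d + 1;       // dilated kernel extent
  return same_padding ? (in + s - 1) / s             // SAME: ceil(in / s)
                      : (in - effective_k) / s + 1;  // VALID
}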
...@@ -31,8 +31,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -31,8 +31,7 @@ TEST(CoreTest, INIT_MODE) {
.AddIntArg("mode", static_cast<int>(NetMode::INIT)) .AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
......
...@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s,
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest"; const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph // Construct graph
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW") .Input("InputNCHW")
...@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s, ...@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else { } else {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
...@@ -64,7 +60,6 @@ void RunDepthToSpace(const bool d2s, ...@@ -64,7 +60,6 @@ void RunDepthToSpace(const bool d2s,
net.RunOp(D); net.RunOp(D);
} }
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -74,108 +69,89 @@ void RunDepthToSpace(const bool d2s, ...@@ -74,108 +69,89 @@ void RunDepthToSpace(const bool d2s,
} }
} // namespace } // namespace
class SpaceToDepthOpTest : public OpsTestBase {}; class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 1, 2, 16},
{1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 1, 2, 16},
{1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::CPU>(
{1, 2, 3, 4, 5, 6, 7, 8, false, {1, 2, 2, 4},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::GPU>(
{1, 2, 3, 4, 5, 6, 7, 8, false, {1, 2, 2, 4},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
class DepthToSpaceOpTest : public OpsTestBase {}; class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(true, {1, 1, 2, 16}, RunDepthToSpace<DeviceType::CPU>(
true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 2, 4, 4},
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 2, 16}, RunDepthToSpace<DeviceType::GPU>(
true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2, {1, 2, 4, 4},
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(true, {1, 1, 1, 16}, RunDepthToSpace<DeviceType::CPU>(
{1, 2, 3, 4, 5, 6, 7, 8, true, {1, 1, 1, 16},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 1, 16}, RunDepthToSpace<DeviceType::GPU>(
{1, 2, 3, 4, 5, 6, 7, 8, true, {1, 1, 1, 16},
9, 10, 11, 12, 13, 14, 15, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
2, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0); const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128}, RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128}, in, 2,
in, {1, 384, 384, 32}, in);
2,
{1, 384, 384, 32},
in);
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void RandomTest(const bool d2s, const int block_size, void RandomTest(const bool d2s,
const int block_size,
const std::vector<index_t> &shape) { const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
srand(time(NULL)); srand(time(NULL));
...@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size,
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", shape); net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW") .Input("InputNCHW")
...@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size,
// Run // Run
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg", BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { ...@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
} }
} // namespace test } // namespace test
......
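All of the DepthToSpace/SpaceToDepth expectations above follow from one index identity. Assuming the TensorFlow-style NHWC convention (which reproduces the listed tensors), with block size B and Cout = Cin / (B * B): out[b, y, x, c] = in[b, y / B, x / B, ((y % B) * B + x % B) * Cout + c]. A sketch:

#include <cstdint>
#include <vector>

// DepthToSpace on an NHWC buffer; SpaceToDepth is the inverse mapping.
std::vector<float> DepthToSpaceNHWC(const std::vector<float> &src,
                                    int64_t n, int64_t h, int64_t w,
                                    int64_t c, int64_t block) {
  const int64_t out_c = c / (block * block);
  const int64_t out_h = h * block;
  const int64_t out_w = w * block;
  std::vector<float> dst(src.size());
  for (int64_t b = 0; b < n; ++b)
    for (int64_t y = 0; y < out_h; ++y)
      for (int64_t x = 0; x < out_w; ++x)
        for (int64_t k = 0; k < out_c; ++k)
          dst[((b * out_h + y) * out_w + x) * out_c + k] =
              src[((b * h + y / block) * w + x / block) * c +
                  ((y % block) * block + x % block) * out_c + k];
  return dst;
}

For Input1x2x16_B2 this maps in[0, 0, 0, 0..3] to {0, 1, 2, 3} and in[0, 0, 1, 0..3] to {16, 17, 18, 19}, matching the expected output.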
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class DepthwiseConv2dOpTest : public OpsTestBase {}; class DepthwiseConv2dOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void SimpleValidTest() { void SimpleValidTest() {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
// Construct graph // Construct graph
...@@ -36,9 +36,7 @@ void SimpleValidTest() { ...@@ -36,9 +36,7 @@ void SimpleValidTest() {
"Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}); "Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}); net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -51,10 +49,8 @@ void SimpleValidTest() { ...@@ -51,10 +49,8 @@ void SimpleValidTest() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -85,8 +81,8 @@ void SimpleValidTest() { ...@@ -85,8 +81,8 @@ void SimpleValidTest() {
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f, {1, 2, 2, 2},
67.1f, 268.2f, 77.1f, 308.2f}); {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
...@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { ...@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
} }
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void ComplexValidTest(index_t batch, index_t channel, index_t height, void ComplexValidTest(index_t batch,
index_t width, index_t kernel, index_t multiplier, index_t channel,
index_t height,
index_t width,
index_t kernel,
index_t multiplier,
int stride) { int stride) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
// Construct graph // Construct graph
...@@ -125,18 +125,14 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -125,18 +125,14 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
std::vector<float> filter_data(kernel * kernel * channel * multiplier); std::vector<float> filter_data(kernel * kernel * channel * multiplier);
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, GenerateRandomRealTypeData({multiplier, channel, kernel, kernel},
&filter_data); &filter_data);
net.AddInputFromArray<D, float>("Filter", net.AddInputFromArray<D, float>(
{multiplier, channel, kernel, kernel}, "Filter", {multiplier, channel, kernel, kernel}, filter_data);
filter_data);
std::vector<float> bias_data(channel * multiplier); std::vector<float> bias_data(channel * multiplier);
GenerateRandomRealTypeData({channel * multiplier}, &bias_data); GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
net.AddInputFromArray<D, float>("Bias", {channel * multiplier}, net.AddInputFromArray<D, float>("Bias", {channel * multiplier}, bias_data);
bias_data);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -150,10 +146,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -150,10 +146,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, ...@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
} }
} }
auto expected = CreateTensor<T>( auto expected =
{1, out_height, out_width, out_channels}, expect); CreateTensor<T>({1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { ...@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
} }
namespace { namespace {
template<typename T> template <typename T>
void TestNxNS12(const index_t height, const index_t width) { void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -263,18 +257,14 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -263,18 +257,14 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>(
{batch, height, width, "Input", {batch, height, width, input_channels});
input_channels});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::GPU, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier {multiplier * input_channels});
* input_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -290,10 +280,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -290,10 +280,8 @@ void TestNxNS12(const index_t height, const index_t width) {
// Run on cpu // Run on cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
NCHW, "Output", NHWC);
"Output",
NHWC);
// Check // Check
Tensor expected; Tensor expected;
...@@ -319,18 +307,16 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -319,18 +307,16 @@ void TestNxNS12(const index_t height, const index_t width) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "DeviceOutput",
"OutputImage",
"DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-5, 1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2,
1e-2, 1e-2); 1e-2);
} }
}; };
...@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) {
} }
} // namespace } // namespace
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) { TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) { TestNxNS12<float>(4, 4); }
TestNxNS12<float>(4, 4);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) { TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12<half>(4, 4); TestNxNS12<half>(4, 4);
......
...@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> { ...@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> {
public: public:
EltwiseOp(const OperatorDef &op_def, Workspace *ws) EltwiseOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(static_cast<kernels::EltwiseType>( functor_(
OperatorBase::GetOptionalArg<int>( static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))), "type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"), OperatorBase::GetRepeatedArgs<float>("coeff"),
OperatorBase::GetOptionalArg<float>("x", 1.0)) {} OperatorBase::GetOptionalArg<float>("x", 1.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor* input0 = this->Input(0); const Tensor *input0 = this->Input(0);
const Tensor* input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
return functor_(input0, input1, output, future); return functor_(input0, input1, output, future);
} }
......
...@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> { ...@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
offset->dim_size()); offset->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, nullptr, nullptr, 0, output, future); return functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
} }
......
...@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma, ...@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
} }
} }
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -83,10 +83,9 @@ void Simple() { ...@@ -83,10 +83,9 @@ void Simple() {
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
0.8291, 0.8291, 3.1708, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
} }
...@@ -108,14 +107,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -108,14 +107,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -128,9 +125,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -128,9 +125,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -173,14 +168,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -173,14 +168,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -193,9 +186,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -193,9 +186,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -239,14 +230,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -239,14 +230,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -259,9 +248,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -259,9 +248,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -303,14 +290,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -303,14 +290,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -323,9 +308,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -323,9 +308,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
......
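The FoldedBatchNorm tests rely on the standard algebra that folds batch-norm statistics into a single per-channel scale and offset: y = gamma * (x - mean) / sqrt(var + eps) + beta = scale * x + offset. A sketch of the fold the tests' CalculateScaleOffset helper performs (the epsilon handling here is an assumption):

#include <cmath>
#include <cstddef>
#include <vector>

void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &var,
                   float eps,
                   std::vector<float> *scale,
                   std::vector<float> *offset) {
  scale->resize(gamma.size());
  offset->resize(gamma.size());
  for (std::size_t i = 0; i < gamma.size(); ++i) {
    (*scale)[i] = gamma[i] / std::sqrt(var[i] + eps);  // folded scale
    (*offset)[i] = beta[i] - mean[i] * (*scale)[i];    // folded offset
  }
}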
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<DeviceType D, class T> template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> { class FullyConnectedOp : public Operator<D, T> {
public: public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
...@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
MACE_CHECK(input->dim(1) == weight->dim(1) MACE_CHECK(
&& input->dim(2) == weight->dim(2) input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) &&
&& input->dim(3) == weight->dim(3) input->dim(3) == weight->dim(3) && weight->dim(0) == bias->dim(0),
&& weight->dim(0) == bias->dim(0), "The shape of Input: ", MakeString(input->shape()),
"The shape of Input: ", "The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
MakeString(input->shape()), bias->dim(0), " don't match.");
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
} else { } else {
MACE_CHECK(input->dim(1) == weight->dim(2) MACE_CHECK(
&& input->dim(2) == weight->dim(3) input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) &&
&& input->dim(3) == weight->dim(1) input->dim(3) == weight->dim(1) && weight->dim(0) == bias->dim(0),
&& weight->dim(0) == bias->dim(0), "The shape of Input: ", MakeString(input->shape()),
"The shape of Input: ", "The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
MakeString(input->shape()), bias->dim(0), " don't match.");
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
} }
return functor_(input, weight, bias, output, future); return functor_(input, weight, bias, output, future);
......
...@@ -24,7 +24,7 @@ namespace test { ...@@ -24,7 +24,7 @@ namespace test {
class FullyConnectedOpTest : public OpsTestBase {}; class FullyConnectedOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple(const std::vector<index_t> &input_shape, void Simple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_value, const std::vector<float> &input_value,
const std::vector<index_t> &weight_shape, const std::vector<index_t> &weight_shape,
...@@ -111,8 +111,8 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) { ...@@ -111,8 +111,8 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::GPU>( Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2},
{1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6, {1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2,
10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6}, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96}); {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
} }
...@@ -122,7 +122,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { ...@@ -122,7 +122,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
} }
namespace { namespace {
template<typename T> template <typename T>
void Random(const index_t batch, void Random(const index_t batch,
const index_t height, const index_t height,
const index_t width, const index_t width,
...@@ -134,15 +134,13 @@ void Random(const index_t batch, ...@@ -134,15 +134,13 @@ void Random(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, channels, height, width}); "Weight", {out_channel, channels, height, width});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest") OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -182,11 +180,11 @@ void Random(const index_t batch, ...@@ -182,11 +180,11 @@ void Random(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1,
1e-1, 1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-3); 1e-3);
} }
} }
} // namespace } // namespace
......
...@@ -25,8 +25,7 @@ template <DeviceType D, class T> ...@@ -25,8 +25,7 @@ template <DeviceType D, class T>
class LocalResponseNormOp : public Operator<D, T> { class LocalResponseNormOp : public Operator<D, T> {
public: public:
LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws), functor_() {
functor_() {
depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5); depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5);
bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f); bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f);
alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f); alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f);
...@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> { ...@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> {
input->dim_size()); input->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future); return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future);
} }
......
...@@ -21,7 +21,7 @@ namespace test { ...@@ -21,7 +21,7 @@ namespace test {
class LocalResponseNormOpTest : public OpsTestBase {}; class LocalResponseNormOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -46,9 +46,9 @@ void Simple() { ...@@ -46,9 +46,9 @@ void Simple() {
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, {1, 1, 2, 6},
0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0, 1e-2);
} }
......
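The LocalResponseNorm Simple() expectations come from the AlexNet-style normalization the header defaults describe (depth_radius = 5, bias = 1, alpha = 1, beta = 0.5): each value is divided by (bias + alpha * sum of squares over neighboring channels) ^ beta. A sketch under that assumed formula:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> LRNOverChannels(const std::vector<float> &src,
                                   int64_t pixels, int64_t channels,
                                   int radius, float bias,
                                   float alpha, float beta) {
  std::vector<float> dst(src.size());
  for (int64_t p = 0; p < pixels; ++p) {
    for (int64_t c = 0; c < channels; ++c) {
      float sqr_sum = 0.0f;
      const int64_t lo = std::max<int64_t>(0, c - radius);
      const int64_t hi = std::min<int64_t>(channels - 1, c + radius);
      for (int64_t k = lo; k <= hi; ++k) {
        const float v = src[p * channels + k];
        sqr_sum += v * v;  // sum of squares in the depth window
      }
      dst[p * channels + c] =
          src[p * channels + c] / std::pow(bias + alpha * sqr_sum, beta);
    }
  }
  return dst;
}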
...@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { ...@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
TEST_F(MatMulOpTest, SimpleOPENCL) { TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1}, Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
{22, 28, 49, 64});
Simple<DeviceType::GPU>( Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
...@@ -127,10 +126,9 @@ void Complex(const index_t batch, ...@@ -127,10 +126,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("A", net.AddRandomInput<DeviceType::GPU, float>("A", {batch, height, channels, 1});
{batch, height, channels, 1}); net.AddRandomInput<DeviceType::GPU, float>("B",
net.AddRandomInput<DeviceType::GPU, float>( {batch, channels, out_width, 1});
"B", {batch, channels, out_width, 1});
// run cpu // run cpu
net.RunOp(); net.RunOp();
...@@ -158,11 +156,11 @@ void Complex(const index_t batch, ...@@ -158,11 +156,11 @@ void Complex(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT); kernels::BufferType::IN_OUT_HEIGHT);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5,
1e-5, 1e-5); 1e-5);
} }
} }
} // namespace } // namespace
......
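Sanity check of the first SimpleOPENCL case above, A (2x3) = {1..6} times B (3x2) = {1..6}:

  [1 2 3]   [1 2]   [1+6+15   2+8+18 ]   [22 28]
  [4 5 6] x [3 4] = [4+15+30  8+20+36] = [49 64]
            [5 6]

which is exactly the expected output {22, 28, 49, 64}.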
...@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> { ...@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> {
PadOp(const OperatorDef &operator_def, Workspace *ws) PadOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings"), functor_(OperatorBase::GetRepeatedArgs<int>("paddings"),
OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {}
{}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
......
...@@ -26,8 +26,7 @@ class ResizeBilinearOp : public Operator<D, T> { ...@@ -26,8 +26,7 @@ class ResizeBilinearOp : public Operator<D, T> {
public: public:
ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_( functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......