Commit 2940ce5d authored by 吴承辉 (Wu Chenghui)

Merge branch 'mace-runtime-status' into 'master'

Return mace status for allocate

See merge request !537
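The change is mechanical but broad: MACE_FAILURE_RETURN is renamed to MACE_RETURN_IF_ERROR, and the hand-written "MaceStatus status = ...; if (status != MaceStatus::MACE_SUCCESS) return status;" sequences around Allocate calls are collapsed into the same macro, so allocation failures now propagate a MaceStatus instead of being checked ad hoc. The macro's definition is not part of this diff; a minimal sketch of what such a status-propagation macro usually looks like (an assumption, not the actual MACE source):

#define MACE_RETURN_IF_ERROR(stmt)                 \
  do {                                             \
    MaceStatus status_ = (stmt);                   \
    /* Propagate any non-success status upward. */ \
    if (status_ != MaceStatus::MACE_SUCCESS) {     \
      return status_;                              \
    }                                              \
  } while (0)

Because the macro contains a return, it only works inside functions that themselves return MaceStatus, which is why Init, Run, LoadModelTensor, and CreateOutputTensorBuffer all carry that return type in the hunks below.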
@@ -155,13 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
      }
    } else {
#endif
-    MACE_FAILURE_RETURN(ws_->LoadModelTensor(
+    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
        *net_def, device_type_, model_data));
    // Init model
    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                         NetMode::INIT);
-    MACE_FAILURE_RETURN(net->Run());
+    MACE_RETURN_IF_ERROR(net->Run());
    net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
#ifdef MACE_ENABLE_HEXAGON
  }
@@ -195,7 +195,7 @@ MaceStatus MaceEngine::Impl::Run(
               " please use 1 to fill missing dimensions");
    Tensor *input_tensor =
        ws_->GetTensor(MakeString("mace_input_node_", input.first));
-    input_tensor->Resize(input.second.shape());
+    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
    {
      Tensor::MappingGuard input_guard(input_tensor);
      float *input_data = input_tensor->mutable_data<float>();
@@ -221,7 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
      hexagon_controller_->ExecuteGraph(*input_tensors[0], output_tensors[0]);
    } else {
#endif
-      MACE_FAILURE_RETURN(net_->Run(run_metadata));
+      MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
#ifdef MACE_ENABLE_HEXAGON
    }
#endif
...
@@ -71,7 +71,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
      CallStats call_stats;
      if (future_wait) {
        StatsFuture future;
-        MACE_FAILURE_RETURN(op->Run(&future));
+        MACE_RETURN_IF_ERROR(op->Run(&future));
        if (run_metadata != nullptr) {
          future.wait_fn(&call_stats);
        } else {
@@ -79,10 +79,10 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
        }
      } else if (run_metadata != nullptr) {
        call_stats.start_micros = NowMicros();
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
        call_stats.end_micros = NowMicros();
      } else {
-        MACE_FAILURE_RETURN(op->Run(nullptr));
+        MACE_RETURN_IF_ERROR(op->Run(nullptr));
      }
      if (run_metadata != nullptr) {
...
@@ -83,10 +83,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
  } else {
    tensor_buffer_ = std::unique_ptr<Buffer>(
        new Buffer(GetDeviceAllocator(type)));
-    MaceStatus status = tensor_buffer_->Allocate(model_data_size);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      return status;
-    }
+    MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
    tensor_buffer_->Map(nullptr);
    tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
                         0, model_data_size);
@@ -156,11 +153,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
      if (mem_block.mem_id() >= 20000) {
        std::unique_ptr<BufferBase> image_buf(
            new Image());
-        MaceStatus status = image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype);
-        if (status != MaceStatus::MACE_SUCCESS) {
-          return status;
-        }
+        MACE_RETURN_IF_ERROR(image_buf->Allocate(
+            {mem_block.x(), mem_block.y()}, dtype));
        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                          std::move(image_buf));
      }
@@ -168,12 +162,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
      if (mem_block.mem_id() < 20000) {
        std::unique_ptr<BufferBase> tensor_buf(
            new Buffer(GetDeviceAllocator(device_type)));
-        MaceStatus status = tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE);
-        if (status != MaceStatus::MACE_SUCCESS) {
-          return status;
-        }
+        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+            mem_block.x() * GetEnumTypeSize(dtype)
+            + MACE_EXTRA_BUFFER_PAD_SIZE));
        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                          std::move(tensor_buf));
      }
...
@@ -40,7 +40,7 @@ struct AddNFunctor {
                  Tensor *output_tensor,
                  StatsFuture *future) {
    MACE_UNUSED(future);
-    MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensors[0]));
+    MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
    index_t size = output_tensor->size();
    Tensor::MappingGuard output_map(output_tensor);
    float *output_data = output_tensor->mutable_data<float>();
...
@@ -21,73 +21,73 @@ namespace mace {
namespace kernels {

void Conv2dNeonK1x1S1(const float *input,
                      const float *filter,
                      const index_t batch,
                      const index_t height,
                      const index_t width,
                      const index_t in_channels,
                      const index_t out_channels,
                      float *output);

void Conv2dNeonK3x3S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK3x3S2(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK5x5S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK1x7S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK7x1S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK7x7S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK7x7S2(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK7x7S3(const float *input,
                      const float *filter,
                      const index_t *in_shape,
                      const index_t *out_shape,
                      float *output);

void Conv2dNeonK1x15S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);

void Conv2dNeonK15x1S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output);

// calculate one output channel and one input channel
inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
@@ -99,13 +99,13 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
                               const index_t out_width,
                               float *out_ptr,
                               const int stride) {
  for (index_t h = 0; h < out_height; ++h) {
    for (index_t w = 0; w < out_width; ++w) {
      for (int i = 0; i < filter_height; ++i) {
        for (int j = 0; j < filter_width; ++j) {
-          out_ptr[h * out_width + w]
-              += in_ptr[(h * stride + i) * in_width + (w * stride + j)]
-              * filter_ptr[i * filter_width + j];
+          out_ptr[h * out_width + w] +=
+              in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
+              filter_ptr[i * filter_width + j];
        }
      }
    }
...
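Conv2dCPUKHxKWCalc is the scalar fallback used when MACE_ENABLE_NEON is off: a direct four-deep loop that accumulates one input channel against one KH x KW filter into one output channel. A self-contained sketch of the same accumulation with illustrative sizes (4x4 input, 3x3 filter, stride 1; none of these values come from the diff):

#include <cstdio>

int main() {
  const int in_width = 4, out_height = 2, out_width = 2;
  const int filter_height = 3, filter_width = 3, stride = 1;
  const float in[16] = {1, 2,  3,  4,  5,  6,  7,  8,
                        9, 10, 11, 12, 13, 14, 15, 16};
  // A single center tap, so the convolution just picks in[h+1][w+1].
  const float filter[9] = {0, 0, 0, 0, 1, 0, 0, 0, 0};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  // Same loop nest as Conv2dCPUKHxKWCalc above.
  for (int h = 0; h < out_height; ++h) {
    for (int w = 0; w < out_width; ++w) {
      for (int i = 0; i < filter_height; ++i) {
        for (int j = 0; j < filter_width; ++j) {
          out[h * out_width + w] +=
              in[(h * stride + i) * in_width + (w * stride + j)] *
              filter[i * filter_width + j];
        }
      }
    }
  }
  printf("%g %g\n%g %g\n", out[0], out[1], out[2], out[3]);  // 6 7 / 10 11
  return 0;
}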
@@ -38,16 +38,15 @@ inline void Conv2dCPUK15x1Calc(const float *in_ptr,
    for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
      for (int i = 0; i < 15; ++i) {
        for (int j = 0; j < 1; ++j) {
-          out_ptr[io * out_image_size + ih * out_width + w + iw]
-              += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)]
-              * filter_ptr[io * in_channels * 15 + i * 1 + j];
+          out_ptr[io * out_image_size + ih * out_width + w + iw] +=
+              in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 1 + j];
        }
      }
    }
  }
}

// Ho = 4, Wo = 1, Co = 1
void Conv2dNeonK15x1S1(const float *input,
                       const float *filter,
@@ -59,7 +58,7 @@ void Conv2dNeonK15x1S1(const float *input,
  const index_t in_batch_size = in_shape[1] * in_image_size;
  const index_t out_batch_size = out_shape[1] * out_image_size;
  const index_t tile_width =
      out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
#pragma omp parallel for collapse(3)
  for (index_t b = 0; b < out_shape[0]; ++b) {
@@ -69,8 +68,7 @@ void Conv2dNeonK15x1S1(const float *input,
      const index_t out_width = out_shape[3];
      const index_t in_channels = in_shape[1];
      const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
      for (index_t c = 0; c < in_channels; ++c) {
        const float *in_ptr_base =
            input + b * in_batch_size + c * in_image_size;
@@ -147,16 +145,16 @@ void Conv2dNeonK15x1S1(const float *input,
            out_ptr_base[out_offset + 2 * out_width] = vo[2];
            out_ptr_base[out_offset + 3 * out_width] = vo[3];
          }  // wt
        }  // h
#else
        Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
                           out_height, out_width, w, tile_width,
                           out_image_size, out_ptr_base, 0, 1);
#endif
      }  // c
    }  // w
  }  // m
}  // b
}
}  // namespace kernels
...
@@ -31,12 +31,8 @@ void Conv2dNeonK1x1S1(const float *input,
                      const index_t out_channels,
                      float *output) {
  for (index_t b = 0; b < batch; ++b) {
-    Gemm(filter,
-         input + b * in_channels * height * width,
-         1,
-         out_channels,
-         in_channels,
-         height * width,
+    Gemm(filter, input + b * in_channels * height * width, 1, out_channels,
+         in_channels, height * width,
         output + b * out_channels * height * width);
  }
}
...
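A 1x1 stride-1 convolution is exactly a matrix product: the filter is an out_channels x in_channels matrix and each image is an in_channels x (height * width) matrix, so the kernel defers to Gemm once per batch. Judging by this call and the /* rows */, /* K */, /* cols */ comments on the Gemm call inside BatchGemm later in the diff, the signature reads Gemm(A, B, batch, rows, K, cols, C); a naive reference with those semantics (an assumed stand-in for the optimized mace/kernels/gemm.h, which this diff does not show):

#include <cstddef>

// C[rows x cols] = A[rows x K] * B[K x cols] for `batch` independent,
// contiguously laid-out matrix pairs. Reference semantics only.
void GemmRef(const float *A, const float *B, std::size_t batch,
             std::size_t rows, std::size_t K, std::size_t cols, float *C) {
  for (std::size_t b = 0; b < batch; ++b) {
    const float *a = A + b * rows * K;
    const float *x = B + b * K * cols;
    float *c = C + b * rows * cols;
    for (std::size_t r = 0; r < rows; ++r) {
      for (std::size_t j = 0; j < cols; ++j) {
        float acc = 0.f;
        for (std::size_t k = 0; k < K; ++k) {
          acc += a[r * K + k] * x[k * cols + j];
        }
        c[r * cols + j] = acc;
      }
    }
  }
}

Substituting the arguments from the loop above, each batch computes output = filter * input with rows = out_channels, K = in_channels, and cols = height * width.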
@@ -17,8 +17,8 @@
#endif

#include "mace/kernels/arm/conv_2d_neon.h"
-#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
+#include "mace/utils/utils.h"

namespace mace {
namespace kernels {
@@ -39,16 +39,15 @@ inline void Conv2dCPUK1x15Calc(const float *in_ptr,
    for (index_t iw = 0; iw < out_width; ++iw) {
      for (int i = 0; i < 1; ++i) {
        for (int j = 0; j < 15; ++j) {
-          out_ptr[io * out_image_size + (h + ih) * out_width + iw]
-              += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)]
-              * filter_ptr[io * in_channels * 15 + i * 15 + j];
+          out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
+              in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
+              filter_ptr[io * in_channels * 15 + i * 15 + j];
        }
      }
    }
  }
}

// Ho = 1, Wo = 4, Co = 1
void Conv2dNeonK1x15S1(const float *input,
                       const float *filter,
@@ -70,8 +69,7 @@ void Conv2dNeonK1x15S1(const float *input,
      const index_t out_width = out_shape[3];
      const index_t in_channels = in_shape[1];
      const index_t in_width = in_shape[3];
-      float *out_ptr_base =
-          output + b * out_batch_size + m * out_image_size;
+      float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
      for (index_t c = 0; c < in_channels; ++c) {
        const float *in_ptr_base =
            input + b * in_batch_size + c * in_image_size;
@@ -133,16 +131,16 @@ void Conv2dNeonK1x15S1(const float *input,
            vst1q_f32(out_ptr_base + out_offset, vo);
          }  // w
        }  // ht
#else
        Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
                           out_height, h, tile_height, out_width,
                           out_image_size, out_ptr_base, 0, 1);
#endif
      }  // c
    }  // h
  }  // m
}  // b
}
}  // namespace kernels
...
@@ -41,8 +41,7 @@ void Conv2dNeonK1x7S1(const float *input,
      const index_t in_channels = in_shape[1];
      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
        float *out_ptr1_base =
            output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK1x7S1(const float *input,
              input + b * in_batch_size + c * in_image_size;
          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
          /* load filter (4 outch x 1 height x 4 width) */
          float32x4_t vf00, vf01;
          float32x4_t vf10, vf11;
@@ -174,7 +170,7 @@ void Conv2dNeonK1x7S1(const float *input,
              vst1q_f32(out_ptr2_base + out_offset, vo2);
              vst1q_f32(out_ptr3_base + out_offset, vo3);
            }  // w
          }  // h
#else
          for (index_t oc = 0; oc < 4; ++oc) {
            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
@@ -239,17 +235,16 @@ void Conv2dNeonK1x7S1(const float *input,
              vst1q_f32(out_ptr0_base + out_offset, vo0);
            }  // w
          }  // h
#else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 1, 7, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
+                             out_height, out_width, out_ptr0_base, 1);
#endif
        }  // c
      }
    }  // if
  }  // m
}  // b
}
}  // namespace kernels
...
@@ -45,7 +45,7 @@ void Conv2dNeonK3x3S1(const float *input,
      float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
      float *out_ptr1_base =
          output + b * out_batch_size + (m + 1) * out_image_size;
#endif
      for (index_t c = 0; c < in_channels; ++c) {
        const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
@@ -54,11 +54,11 @@ void Conv2dNeonK3x3S1(const float *input,
#if defined(MACE_ENABLE_NEON)
        float *out_ptr1 = out_ptr1_base;
        const float *in_ptr1 =
            input + b * in_batch_size + c * in_image_size + 1 * in_width;
        const float *in_ptr2 =
            input + b * in_batch_size + c * in_image_size + 2 * in_width;
        const float *in_ptr3 =
            input + b * in_batch_size + c * in_image_size + 3 * in_width;
        const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
#endif
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
@@ -75,7 +75,6 @@ void Conv2dNeonK3x3S1(const float *input,
        vf11 = vld1q_f32(filter_ptr1 + 3);
        vf12 = vld1q_f32(filter_ptr1 + 6);
        for (index_t h = 0; h + 1 < out_height; h += 2) {
          for (index_t w = 0; w + 3 < out_width; w += 4) {
            // input (4 height x 3 slide): vi_height_slide
@@ -179,7 +178,7 @@ void Conv2dNeonK3x3S1(const float *input,
            out_ptr0 += out_width;
            out_ptr1 += out_width;
          }  // h
#elif defined(MACE_ENABLE_NEON)  // arm v7
        float *out_ptr0 = out_ptr0_base;
@@ -198,7 +197,6 @@ void Conv2dNeonK3x3S1(const float *input,
        vf167 = vld1_f32(filter_ptr1 + 6);
        vf189 = vld1_f32(filter_ptr1 + 8);
        for (index_t h = 0; h + 1 < out_height; h += 2) {
          for (index_t w = 0; w + 3 < out_width; w += 4) {
            // input (4 height x 3 slide): vi_height_slide
@@ -313,18 +311,18 @@ void Conv2dNeonK3x3S1(const float *input,
      }  // c
    } else {
      for (index_t mm = m; mm < out_channels; ++mm) {
-        float
-            *out_ptr0_base = output + b * out_batch_size + mm * out_image_size;
+        float *out_ptr0_base =
+            output + b * out_batch_size + mm * out_image_size;
        for (index_t c = 0; c < in_channels; ++c) {
-          const float
-              *in_ptr0 = input + b * in_batch_size + c * in_image_size;
+          const float *in_ptr0 =
+              input + b * in_batch_size + c * in_image_size;
#if defined(MACE_ENABLE_NEON)
          const float *in_ptr1 =
              input + b * in_batch_size + c * in_image_size + 1 * in_width;
          const float *in_ptr2 =
              input + b * in_batch_size + c * in_image_size + 2 * in_width;
          const float *in_ptr3 =
              input + b * in_batch_size + c * in_image_size + 3 * in_width;
#endif
          const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
@@ -396,7 +394,6 @@ void Conv2dNeonK3x3S1(const float *input,
              vst1q_f32(out_ptr0, vo00);
              vst1q_f32(out_ptr0 + out_width, vo01);
              in_ptr0 += 4;
              in_ptr1 += 4;
              in_ptr2 += 4;
@@ -411,7 +408,7 @@ void Conv2dNeonK3x3S1(const float *input,
            in_ptr3 += 2 + in_width;
            out_ptr0 += out_width;
          }  // h
#elif defined(MACE_ENABLE_NEON)  // arm v7
          float *out_ptr0 = out_ptr0_base;
@@ -482,7 +479,6 @@ void Conv2dNeonK3x3S1(const float *input,
              vst1q_f32(out_ptr0, vo00);
              vst1q_f32(out_ptr0 + out_width, vo01);
              in_ptr0 += 4;
              in_ptr1 += 4;
              in_ptr2 += 4;
@@ -499,15 +495,14 @@ void Conv2dNeonK3x3S1(const float *input,
            out_ptr0 += out_width;
          }  // h
#else
-          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0,
-                             in_width, 3, 3, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
+                             out_width, out_ptr0_base, 1);
#endif
        }  // c
      }  // mm
    }  // if
  }  // m
}  // b
}

void Conv2dNeonK3x3S2(const float *input,
@@ -529,8 +524,7 @@ void Conv2dNeonK3x3S2(const float *input,
        const index_t out_height = out_shape[2];
        const index_t out_width = out_shape[3];
        const float *in_base = input + b * in_batch_size + c * in_image_size;
-        const float
-            *filter_ptr = filter + m * in_channels * 9 + c * 9;
+        const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
        float *out_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
@@ -569,8 +563,8 @@ void Conv2dNeonK3x3S2(const float *input,
            index_t out_offset = h * out_width + w;
            vo = vld1q_f32(out_base + out_offset);
            vi00 = vi0.val[0];                // [0.2.4.6]
            vi01 = vi0.val[1];                // [1.3.5.7]
            vi02 = vextq_f32(vi00, vi0n, 1);  // [2.4.6.8]
            vi10 = vi1.val[0];
            vi11 = vi1.val[1];
@@ -591,8 +585,8 @@ void Conv2dNeonK3x3S2(const float *input,
            vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
            vst1q_f32(out_base + out_offset, vo);
          }  // w
        }  // h
#elif defined(MACE_ENABLE_NEON)  // arm v7
        // load filter (1 outch x 3 height x 3 width): vf_outch_height
        float32x2_t vf01, vf23, vf45, vf67, vf78;
@@ -631,8 +625,8 @@ void Conv2dNeonK3x3S2(const float *input,
            index_t out_offset = h * out_width + w;
            vo = vld1q_f32(out_base + out_offset);
            vi00 = vi0.val[0];                // [0.2.4.6]
            vi01 = vi0.val[1];                // [1.3.5.7]
            vi02 = vextq_f32(vi00, vi0n, 1);  // [2.4.6.8]
            vi10 = vi1.val[0];
            vi11 = vi1.val[1];
@@ -654,15 +648,14 @@ void Conv2dNeonK3x3S2(const float *input,
            vst1q_f32(out_base + out_offset, vo);
          }  // w
        }  // h
#else
-        Conv2dCPUKHxKWCalc(in_base, filter_ptr,
-                           in_width, 3, 3, out_height, out_width,
-                           out_base, 2);
+        Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
+                           out_width, out_base, 2);
#endif
      }  // c
    }  // m
  }  // b
}
}  // namespace kernels
...
@@ -21,59 +21,59 @@
namespace mace {
namespace kernels {

#define MACE_Conv2dNeonK5x5SnLoadCalc4                      \
  /* load filter (4 outch x 1 height x 4 width) */          \
  float32x4_t vf00, vf10, vf20, vf30;                       \
  float32x2_t vf01, vf11, vf21, vf31;                       \
  vf00 = vld1q_f32(filter_ptr0);                            \
  vf01 = vld1_f32(filter_ptr0 + 3);                         \
  vf10 = vld1q_f32(filter_ptr1);                            \
  vf11 = vld1_f32(filter_ptr1 + 3);                         \
  vf20 = vld1q_f32(filter_ptr2);                            \
  vf21 = vld1_f32(filter_ptr2 + 3);                         \
  vf30 = vld1q_f32(filter_ptr3);                            \
  vf31 = vld1_f32(filter_ptr3 + 3);                         \
                                                            \
  /* outch 0 */                                             \
  vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);    \
  vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);    \
  vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);   \
  vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);   \
  vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);                  \
                                                            \
  /* outch 1 */                                             \
  vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);    \
  vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);    \
  vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);   \
  vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);   \
  vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 1);                  \
                                                            \
  /* outch 2 */                                             \
  vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);    \
  vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);    \
  vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);   \
  vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);   \
  vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 1);                  \
                                                            \
  /* outch 3 */                                             \
  vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);    \
  vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);    \
  vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);   \
  vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);   \
  vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);

#define MACE_Conv2dNeonK5x5SnLoadCalc1                      \
  /* load filter (1 outch x 1 height x 4 width) */          \
  float32x4_t vf00;                                         \
  float32x2_t vf01;                                         \
  vf00 = vld1q_f32(filter_ptr0);                            \
  vf01 = vld1_f32(filter_ptr0 + 3);                         \
                                                            \
  /* outch 0 */                                             \
  vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);    \
  vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);    \
  vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);   \
  vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);   \
  vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);

// Ho = 1, Wo = 4, Co = 4
@@ -99,7 +99,7 @@ void Conv2dNeonK5x5S1(const float *input,
      float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
      float *out_ptr1_base =
          output + b * out_batch_size + (m + 1) * out_image_size;
      float *out_ptr2_base =
          output + b * out_batch_size + (m + 2) * out_image_size;
      float *out_ptr3_base =
@@ -118,7 +118,7 @@ void Conv2dNeonK5x5S1(const float *input,
              filter + (m + 3) * in_channels * 25 + c * 25;
          for (index_t h = 0; h < out_height; ++h) {
            for (index_t w = 0; w + 3 < out_width; w += 4) {
              // input offset
              index_t in_offset = h * in_width + w;
              // output (4 outch x 1 height x 4 width): vo_outch_height
              float32x4_t vo0, vo1, vo2, vo3;
@@ -157,7 +157,7 @@ void Conv2dNeonK5x5S1(const float *input,
              filter_ptr2 -= 25;
              filter_ptr3 -= 25;
            }  // w
          }  // h
#else
          for (index_t oc = 0; oc < 4; ++oc) {
            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25,
@@ -203,17 +203,16 @@ void Conv2dNeonK5x5S1(const float *input,
              vst1q_f32(out_ptr0_base + out_offset, vo0);
              filter_ptr0 -= 25;
            }  // w
          }  // h
#else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 5, 5, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
+                             out_height, out_width, out_ptr0_base, 1);
#endif
        }  // c
      }  // mm
    }  // if
  }  // m
}  // b
}
}  // namespace kernels
...
@@ -41,8 +41,7 @@ void Conv2dNeonK7x1S1(const float *input,
      const index_t in_channels = in_shape[1];
      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
-        float *out_ptr0_base =
-            output + b * out_batch_size + m * out_image_size;
+        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
        float *out_ptr1_base =
            output + b * out_batch_size + (m + 1) * out_image_size;
@@ -56,12 +55,9 @@ void Conv2dNeonK7x1S1(const float *input,
              input + b * in_batch_size + c * in_image_size;
          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 =
-              filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 =
-              filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 =
-              filter + (m + 3) * in_channels * 7 + c * 7;
+          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
+          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
+          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
          /* load filter (4 outch x 4 height x 1 width) */
          float32x4_t vf00, vf01;
          float32x4_t vf10, vf11;
@@ -98,7 +94,6 @@ void Conv2dNeonK7x1S1(const float *input,
                                 out_ptr3_base[out_offset + 2 * out_width],
                                 out_ptr3_base[out_offset + 3 * out_width]};
              // input offset
              index_t in_offset = h * in_width + w;
              // input (3 slide)
@@ -203,7 +198,7 @@ void Conv2dNeonK7x1S1(const float *input,
              out_ptr3_base[out_offset + 2 * out_width] = vo3[2];
              out_ptr3_base[out_offset + 3 * out_width] = vo3[3];
            }  // w
          }  // h
#else
          for (index_t oc = 0; oc < 4; ++oc) {
            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
@@ -280,17 +275,16 @@ void Conv2dNeonK7x1S1(const float *input,
              out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
              out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
            }  // w
          }  // h
#else
-          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
-                             in_width, 7, 1, out_height, out_width,
-                             out_ptr0_base, 1);
+          Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
+                             out_height, out_width, out_ptr0_base, 1);
#endif
        }  // c
      }
    }  // if
  }  // m
}  // b
}
}  // namespace kernels
...
@@ -17,8 +17,8 @@
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h"
-#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
+#include "mace/utils/utils.h"

namespace mace {
namespace kernels {
@@ -44,14 +44,13 @@ void TransformInput4x4(const float *input,
      for (index_t h = 0; h < in_height - 2; h += 2) {
        for (index_t w = 0; w < in_width - 2; w += 2) {
          float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
              d15;
          float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
              s15;
          // load tile data
-          const float *input_ptr =
-              input + n * input_batch_size + c * in_height_width + h * in_width
-              + w;
+          const float *input_ptr = input + n * input_batch_size +
+                                   c * in_height_width + h * in_width + w;
          d0 = input_ptr[0];
          d1 = input_ptr[1];
          d2 = input_ptr[2];
@@ -92,7 +91,7 @@ void TransformInput4x4(const float *input,
          // store output
          float *output_ptr =
              output + n * output_batch_size + c * tile_count + tile_index;
          output_ptr[0] = s0;
          output_ptr[1 * stride] = s1;
          output_ptr[2 * stride] = s2;
@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input,
      float s[8][8];
      for (index_t h = 0; h < in_height - 2; h += 6) {
        for (index_t w = 0; w < in_width - 2; w += 6) {
-          const float *input_ptr =
-              input + n * input_batch_size + c * in_height_width + h * in_width
-              + w;
+          const float *input_ptr = input + n * input_batch_size +
+                                   c * in_height_width + h * in_width + w;
          for (int i = 0; i < 8; ++i) {
            float d0, d1, d2, d3, d4, d5, d6, d7;
@@ -203,7 +201,7 @@ void TransformInput8x8(const float *input,
          }
          float *output_ptr =
              output + n * output_batch_size + c * tile_count + tile_index;
          for (int i = 0; i < 8; ++i) {
            float d0, d1, d2, d3, d4, d5, d6, d7;
            d0 = s[0][i];
@@ -258,27 +256,18 @@ void BatchGemm(const float *input,
  const index_t out_stride = out_channels * tile_count;
  if (batch == 1) {
-    Gemm(filter,
-         input,
-         in_tile_area,
-         out_channels,
-         in_channels,
-         tile_count,
-         output);
+    Gemm(filter, input, in_tile_area, out_channels, in_channels, tile_count,
+         output);
  } else {
#pragma omp parallel for collapse(2)
    for (int b = 0; b < batch; ++b) {
      for (int i = 0; i < in_tile_area; ++i) {
-        const float
-            *in_ptr = input + b * in_batch_size + i * in_stride;
+        const float *in_ptr = input + b * in_batch_size + i * in_stride;
        const float *filter_ptr = filter + i * filter_stride;
        float *out_ptr = output + b * out_batch_size + i * out_stride;
-        Gemm(filter_ptr,
-             in_ptr,
-             1,
-             out_channels, /* rows */
-             in_channels,  /* K */
-             tile_count,   /* cols */
+        Gemm(filter_ptr, in_ptr, 1, out_channels, /* rows */
+             in_channels,                         /* K */
+             tile_count,                          /* cols */
             out_ptr);
      }
    }
@@ -305,12 +294,12 @@ void TransformOutput4x4(const float *input,
      for (index_t h = 0; h < out_height; h += 2) {
        for (index_t w = 0; w < out_width; w += 2) {
          float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
              d15;
          float s0, s1, s2, s3, s4, s5, s6, s7;
          float v0, v1, v2, v3;
          const float *input_ptr =
              input + n * input_batch_size + m * tile_count + tile_offset;
          d0 = input_ptr[0];
          d1 = input_ptr[1 * stride];
          d2 = input_ptr[2 * stride];
@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input,
          v2 = s2 - s4 - s6;
          v3 = s3 - s5 - s7;
-          float *output_ptr =
-              output + n * output_batch_size + m * out_image_size + h * out_width
-              + w;
+          float *output_ptr = output + n * output_batch_size +
+                              m * out_image_size + h * out_width + w;
          output_ptr[0] = v0;
          output_ptr[1] = v1;
          output_ptr[out_width] = v2;
@@ -403,7 +391,7 @@ void TransformOutput8x8(const float *input,
      for (index_t h = 0; h < out_height; h += 6) {
        for (index_t w = 0; w < out_width; w += 6) {
          const float *input_ptr =
              input + n * input_batch_size + m * tile_count + tile_offset;
          for (int i = 0; i < 8; ++i) {
            float d0, d1, d2, d3, d4, d5, d6, d7;
@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input,
            input_ptr += 8 * stride;
          }
-          float *output_ptr =
-              output + n * output_batch_size + m * out_image_size + h * out_width
-              + w;
+          float *output_ptr = output + n * output_batch_size +
+                              m * out_image_size + h * out_width + w;
          for (int i = 0; i < 6; ++i) {
            float d0, d1, d2, d3, d4, d5, d6, d7;
@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input,
}
}  // namespace

// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
@@ -485,7 +471,7 @@ void TransformFilter4x4(const float *filter,
    for (index_t c = 0; c < in_channels; ++c) {
      float g0, g1, g2, g3, g4, g5, g6, g7, g8;
      float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
          s15;
      // load filter
      index_t filter_offset = (m * in_channels + c) * 9;
@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter,
                        float *output) {
  const index_t stride = out_channels * in_channels;
-  const float G[8][3] = {
-      {1.0f, 0.0f, 0.0f},
-      {-2.0f / 9, -2.0f / 9, -2.0f / 9},
-      {-2.0f / 9, 2.0f / 9, -2.0f / 9},
-      {1.0f / 90, 1.0f / 45, 2.0f / 45},
-      {1.0f / 90, -1.0f / 45, 2.0f / 45},
-      {1.0f / 45, 1.0f / 90, 1.0f / 180},
-      {1.0f / 45, -1.0f / 90, 1.0f / 180},
-      {0.0f, 0.0f, 1.0f}
-  };
+  const float G[8][3] = {{1.0f, 0.0f, 0.0f},
+                         {-2.0f / 9, -2.0f / 9, -2.0f / 9},
+                         {-2.0f / 9, 2.0f / 9, -2.0f / 9},
+                         {1.0f / 90, 1.0f / 45, 2.0f / 45},
+                         {1.0f / 90, -1.0f / 45, 2.0f / 45},
+                         {1.0f / 45, 1.0f / 90, 1.0f / 180},
+                         {1.0f / 45, -1.0f / 90, 1.0f / 180},
+                         {0.0f, 0.0f, 1.0f}};
#pragma omp parallel for collapse(2)
  for (index_t m = 0; m < out_channels; ++m) {
@@ -612,7 +596,7 @@ void TransformFilter8x8(const float *filter,
    for (int i = 0; i < 8; ++i) {
      for (int j = 0; j < 8; ++j) {
        output[output_offset + (i * 8 + j) * stride] =
            G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
      }
    }
  }
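The G[8][3] table above appears to be the filter-transform matrix of Winograd's F(6x6, 3x3) algorithm (this attribution is inferred from the 8x8 transformed tiles and the out_tile_size == 6 case below, not stated in the diff). In Lavin and Gray's formulation the nested i/j loop computes the standard filter transform

U = G \, g \, G^{\top}, \qquad G \in \mathbb{R}^{8 \times 3}, \; g \in \mathbb{R}^{3 \times 3}, \; U \in \mathbb{R}^{8 \times 8}

where g is one 3x3 filter tile; for the stored sum G[i][0]*s[0][j] + G[i][1]*s[1][j] + G[i][2]*s[2][j] to equal U[i][j], the s array (computed just outside this hunk) must hold g * G^T.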
@@ -633,62 +617,38 @@ void WinoGradConv3x3s1(const float *input,
  index_t out_height = in_height - 2;
  index_t out_width = in_width - 2;
  index_t tile_height_count =
      RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
  index_t tile_width_count =
      RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
  index_t tile_count = tile_height_count * tile_width_count;
  switch (out_tile_size) {
    case 2:
-      TransformInput4x4(input,
-                        batch,
-                        in_height,
-                        in_width,
-                        in_channels,
-                        tile_count,
-                        transformed_input);
+      TransformInput4x4(input, batch, in_height, in_width, in_channels,
+                        tile_count, transformed_input);
      break;
    case 6:
-      TransformInput8x8(input,
-                        batch,
-                        in_height,
-                        in_width,
-                        in_channels,
-                        tile_count,
-                        transformed_input);
+      TransformInput8x8(input, batch, in_height, in_width, in_channels,
+                        tile_count, transformed_input);
      break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
  }
-  BatchGemm(transformed_input,
-            transformed_filter,
-            batch,
-            in_channels,
-            out_channels,
-            tile_count,
-            out_tile_size,
-            transformed_output);
+  BatchGemm(transformed_input, transformed_filter, batch, in_channels,
+            out_channels, tile_count, out_tile_size, transformed_output);
  switch (out_tile_size) {
    case 2:
-      TransformOutput4x4(transformed_output,
-                         batch,
-                         out_height,
-                         out_width,
-                         out_channels,
-                         tile_count,
-                         output);
+      TransformOutput4x4(transformed_output, batch, out_height, out_width,
+                         out_channels, tile_count, output);
      break;
    case 6:
-      TransformOutput8x8(transformed_output,
-                         batch,
-                         out_height,
-                         out_width,
-                         out_channels,
-                         tile_count,
-                         output);
+      TransformOutput8x8(transformed_output, batch, out_height, out_width,
+                         out_channels, tile_count, output);
      break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
  }
}
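These three stages are the classic Winograd pipeline: TransformInput* computes V = B^T d B for every input tile d, BatchGemm multiplies the transformed tensors, and TransformOutput* applies the inverse transform. For a single tile the computed output is

Y = A^{\top} \left[ (G \, g \, G^{\top}) \odot (B^{\top} d \, B) \right] A

where \odot is the elementwise product. Summing those elementwise products over input channels is what turns the middle step into in_tile_area independent GEMMs of shape (out_channels x in_channels) times (in_channels x tile_count), exactly the rows/K/cols annotated on the Gemm call inside BatchGemm.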
@@ -704,52 +664,39 @@ void WinoGradConv3x3s1(const float *input,
  index_t out_height = in_height - 2;
  index_t out_width = in_width - 2;
  index_t tile_height_count =
      RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
  index_t tile_width_count =
      RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
  index_t tile_count = tile_height_count * tile_width_count;
  index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2);
  index_t transformed_input_size =
      in_tile_area * batch * in_channels * tile_count;
  index_t transformed_filter_size = in_tile_area * out_channels * in_channels;
-  index_t
-      transformed_output_size = in_tile_area * batch * out_channels * tile_count;
+  index_t transformed_output_size =
+      in_tile_area * batch * out_channels * tile_count;
  float *transformed_input = new float[transformed_input_size];    // TNCB
  float *transformed_filter = new float[transformed_filter_size];  // TOC
  float *transformed_output = new float[transformed_output_size];
  switch (out_tile_size) {
    case 2:
-      TransformFilter4x4(filter,
-                         in_channels,
-                         out_channels,
-                         transformed_filter);
+      TransformFilter4x4(filter, in_channels, out_channels, transformed_filter);
      break;
    case 6:
-      TransformFilter8x8(filter,
-                         in_channels,
-                         out_channels,
-                         transformed_filter);
+      TransformFilter8x8(filter, in_channels, out_channels, transformed_filter);
      break;
-    default:MACE_NOT_IMPLEMENTED;
+    default:
+      MACE_NOT_IMPLEMENTED;
  }
-  WinoGradConv3x3s1(input,
-                    transformed_filter,
-                    batch,
-                    in_height,
-                    in_width,
-                    in_channels,
-                    out_channels,
-                    out_tile_size,
-                    transformed_input,
-                    transformed_output,
-                    output);
-  delete[]transformed_input;
-  delete[]transformed_filter;
-  delete[]transformed_output;
+  WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width,
+                    in_channels, out_channels, out_tile_size, transformed_input,
+                    transformed_output, output);
+
+  delete[] transformed_input;
+  delete[] transformed_filter;
+  delete[] transformed_output;
}

void ConvRef3x3s1(const float *input,
@@ -769,7 +716,7 @@ void ConvRef3x3s1(const float *input,
      for (index_t h = 0; h < out_height; ++h) {
        for (index_t w = 0; w < out_width; ++w) {
          index_t out_offset =
              ((b * out_channels + m) * out_height + h) * out_width + w;
          output[out_offset] = 0;
          for (index_t c = 0; c < in_channels; ++c) {
            for (index_t kh = 0; kh < 3; ++kh) {
@@ -777,11 +724,10 @@ void ConvRef3x3s1(const float *input,
              index_t ih = h + kh;
              index_t iw = w + kw;
              index_t in_offset =
                  ((b * in_channels + c) * in_height + ih) * in_width + iw;
-              index_t
-                  filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
-              output[out_offset] +=
-                  input[in_offset] * filter[filter_offset];
+              index_t filter_offset =
+                  (((m * in_channels) + c) * 3 + kh) * 3 + kw;
+              output[out_offset] += input[in_offset] * filter[filter_offset];
            }
          }
        }
...
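ConvRef3x3s1 is the plain reference the Winograd path is validated against in the test below: a direct "valid" 3x3 convolution with no padding, so out_height = in_height - 2 and out_width = in_width - 2. Per output element it computes

\text{output}(b, m, h, w) = \sum_{c} \sum_{kh=0}^{2} \sum_{kw=0}^{2} \text{input}(b, c, h + kh, w + kw) \cdot \text{filter}(m, c, kh, kw)

which is exactly the in_offset/filter_offset arithmetic in the loop above.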
...@@ -13,13 +13,13 @@ ...@@ -13,13 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <random>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <random>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_winograd.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) { ...@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1); std::normal_distribution<float> nd(0, 1);
std::generate(input_data, input_data + input_size, std::generate(input_data, input_data + input_size, [&gen, &nd] {
[&gen, &nd] { return std::max(-1.0f, std::min(1.0f, nd(gen)));
return std::max(-1.0f, std::min(1.0f, nd(gen))); });
}); std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
std::generate(filter_data, filter_data + filter_size, return std::max(-1.0f, std::min(1.0f, nd(gen)));
[&gen, &nd] { });
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
kernels::ConvRef3x3s1(input_data, kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
filter_data, in_channels, out_channels, output_data_ref);
batch,
in_height,
in_width,
in_channels,
out_channels,
output_data_ref);
kernels::WinoGradConv3x3s1(input_data, kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
filter_data, in_width, in_channels, out_channels, 6,
batch,
in_height,
in_width,
in_channels,
out_channels,
6,
output_data); output_data);
// test // test
......
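The test drives both paths with out_tile_size = 6, i.e. Winograd F(6x6, 3x3): each 8x8 input tile (8 = 6 + 3 - 1) is transformed once, multiplied elementwise against the transformed filter, and inverse-transformed back to a 6x6 output tile. Per tile and channel pair that is 8*8 = 64 multiplications where the direct method needs 6*6*9 = 324, roughly a 5x reduction in multiplies, paid for by the transform overhead and the three scratch buffers freed at the end of WinoGradConv3x3s1 above. Clamping the random inputs to [-1, 1] keeps the transform's accumulated numerical error small for the check that follows.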
...@@ -32,15 +32,15 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -32,15 +32,15 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
float *output); float *output);
void DepthwiseConv2dNeonK3x3S2(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter, const float *filter,
const index_t *in_shape, const index_t *in_shape,
const index_t *out_shape, const index_t *out_shape,
const int *pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
const index_t valid_w_stop, const index_t valid_w_stop,
float *output); float *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base, ...@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1 // Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter, const float *filter,
const index_t* in_shape, const index_t *in_shape,
const index_t* out_shape, const index_t *out_shape,
const int* pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
...@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
...@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w, DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
h - pad_top, w - pad_left, out_width, in_height, in_width, 3,
w - pad_left, 3, out_base);
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
...@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// right // right
for (; w < out_width; ++w) { for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w, DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
h - pad_top, w - pad_left, out_width, in_height, in_width, 3,
w - pad_left, 3, out_base);
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} // h } // h
#else #else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_shape[3]; ++iw) { for (index_t iw = 0; iw < out_shape[3]; ++iw) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih - pad_top,
filter_ptr, iw - pad_left, out_width, in_height, in_width, 3,
ih, 3, out_base);
iw,
ih - pad_top,
iw - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
#endif #endif
...@@ -275,29 +221,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, ...@@ -275,29 +221,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// bottom // bottom
for (; h < out_shape[2]; ++h) { for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
filter_ptr, w - pad_left, out_width, in_height, in_width, 3,
h, 3, out_base);
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
} // m } // m
} // b } // b
} }
void DepthwiseConv2dNeonK3x3S2(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter, const float *filter,
const index_t* in_shape, const index_t *in_shape,
const index_t* out_shape, const index_t *out_shape,
const int* pad_hw, const int *pad_hw,
const index_t valid_h_start, const index_t valid_h_start,
const index_t valid_h_stop, const index_t valid_h_stop,
const index_t valid_w_start, const index_t valid_w_start,
...@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
...@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
for (h = valid_h_start; h < valid_h_stop; ++h) { for (h = valid_h_start; h < valid_h_stop; ++h) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
...@@ -397,8 +316,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -397,8 +316,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
index_t out_offset = h * out_width + w; index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7] vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0]; vi10 = vi1.val[0];
vi11 = vi1.val[1]; vi11 = vi1.val[1];
...@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// right // right
for (; w < out_width; ++w) { for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} // h } // h
#else #else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) { for (index_t iw = 0; iw < out_width; ++iw) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih * 2 - pad_top,
filter_ptr, iw * 2 - pad_left, out_width, in_height,
ih, in_width, 3, 3, out_base);
iw,
ih * 2 - pad_top,
iw * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
#endif #endif
...@@ -471,22 +372,13 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, ...@@ -471,22 +372,13 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// bottom // bottom
for (; h < out_shape[2]; ++h) { for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) { for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
filter_ptr, w * 2 - pad_left, out_width, in_height, in_width,
h, 3, 3, out_base);
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
} }
} }
} // m } // m
} // b } // b
} }
} // namespace kernels } // namespace kernels
......
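Every border pixel in both kernels above funnels through DepthwiseConv2dPixel, whose body sits outside these hunks. A plausible minimal form, matching the twelve-argument call sites and shown only as a sketch (the real implementation may differ):

    // Hypothetical scalar fallback for one output pixel. (h, w) index the
    // output; (ih_base, iw_base) are the matching top-left input coordinates,
    // which may be negative or past the edge inside the padded border.
    void DepthwiseConv2dPixel(const float *in_base, const float *filter_ptr,
                              index_t h, index_t w,
                              index_t ih_base, index_t iw_base,
                              index_t out_width, index_t in_height,
                              index_t in_width, int filter_h, int filter_w,
                              float *out_base) {
      float sum = 0.f;
      for (int kh = 0; kh < filter_h; ++kh) {
        for (int kw = 0; kw < filter_w; ++kw) {
          const index_t ih = ih_base + kh;
          const index_t iw = iw_base + kw;
          // Zero padding: taps outside the input contribute nothing.
          if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
            sum += in_base[ih * in_width + iw] * filter_ptr[kh * filter_w + kw];
          }
        }
      }
      out_base[h * out_width + w] = sum;
    }

In the stride-2 kernel the vectorized interior uses vld2q_f32-style deinterleaved loads, which is what the [0.2.4.6] / [1.3.5.7] lane comments mark: even and odd input columns land in separate registers, so four stride-2 outputs can be produced per iteration.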
...@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor { ...@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor {
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
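From here on the patch is dominated by the rename from MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR. The macro definition is not part of this diff, but to read the call sites it is enough to assume the usual early-return shape, something like:

    // Assumed definition (not shown in this diff): evaluate the statement
    // once and propagate any non-success MaceStatus to the caller.
    #define MACE_RETURN_IF_ERROR(stmt)    \
      {                                   \
        MaceStatus status = (stmt);       \
        if (status != MACE_SUCCESS) {     \
          return status;                  \
        }                                 \
      }

Wrapping Resize/ResizeImage/Allocate calls in it is what lets allocation failures surface as a returned status instead of being silently dropped.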
...@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
outer_sizes[i] = input->size() / inner_size; outer_sizes[i] = input->size() / inner_size;
output_shape[axis_] += input->dim(axis_); output_shape[axis_] += input->dim(axis_);
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
......
...@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
RoundType::FLOOR, RoundType::FLOOR,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t channels = output->dim(1);
...@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (is_filter_transformed_) { if (is_filter_transformed_) {
transformed_filter_ptr = filter_data; transformed_filter_ptr = filter_data;
} else { } else {
MACE_FAILURE_RETURN(transformed_filter_.Resize( MACE_RETURN_IF_ERROR(transformed_filter_.Resize(
transformed_filter_shape)); transformed_filter_shape));
switch (winograd_out_tile_size) { switch (winograd_out_tile_size) {
case 2: case 2:
...@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const Tensor *pad_input_ptr = input; const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height if (extra_input_height != input_height
|| extra_input_width != input_width) { || extra_input_width != input_width) {
MACE_FAILURE_RETURN(ConstructNCHWInputWithSpecificPadding(input, MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input,
pad_top, pad_top,
pad_bottom, pad_bottom,
pad_left, pad_left,
......
...@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, ...@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2; const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2; const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
...@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, ...@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const int pad_width = pad_left + pad_right; const int pad_width = pad_left + pad_right;
std::vector<index_t> output_shape( std::vector<index_t> output_shape(
{batch, channels, height + pad_height, width + pad_width}); {batch, channels, height + pad_height, width + pad_width});
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
output_tensor->Clear(); output_tensor->Clear();
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
...@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, ...@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2; const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2; const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor); Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>(); float *output_data = output_tensor->mutable_data<float>();
......
...@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_, padding_type_, strides_, padding_type_,
output_shape.data(), output_shape.data(),
paddings_.data(), true); paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
} else { } else {
output_shape_.clear(); output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0); output_shape_ = std::vector<index_t>(4, 0);
...@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_, strides_,
output_shape_.data(), output_shape_.data(),
paddings_.data(), true); paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape_)); MACE_RETURN_IF_ERROR(output->Resize(output_shape_));
} }
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
......
...@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor { ...@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor {
std::vector<index_t> output_shape = {batch_size, output_depth, std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width}; output_height, output_width};
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
...@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
RoundType::FLOOR, RoundType::FLOOR,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
output->Clear(); output->Clear();
index_t batch = output->dim(0); index_t batch = output->dim(0);
......
...@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase { ...@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
} }
} }
} }
MACE_FAILURE_RETURN(output->ResizeLike(input0)); MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard input0_guard(input0); Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
......
...@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1}; std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
MACE_FAILURE_RETURN(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t N = output->dim(0); const index_t N = output->dim(0);
const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
const index_t output_size = weight->dim(0); const index_t output_size = weight->dim(0);
......
...@@ -44,7 +44,7 @@ struct MatMulFunctor { ...@@ -44,7 +44,7 @@ struct MatMulFunctor {
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
MACE_FAILURE_RETURN(C->Resize(c_shape)); MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A); Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B); Tensor::MappingGuard guardb(B);
......
...@@ -21,12 +21,12 @@ ...@@ -21,12 +21,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
T>::operator()(const Tensor *input, const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
const index_t width = input->dim(2); const index_t width = input->dim(2);
...@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
switch (activation_) { switch (activation_) {
case RELU:tuning_key_prefix_ = "relu_opencl_kernel"; case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX:tuning_key_prefix_ = "relux_opencl_kernel"; case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU:tuning_key_prefix_ = "prelu_opencl_kernel"; case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH:tuning_key_prefix_ = "tanh_opencl_kernel"; case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID:tuning_key_prefix_ = "sigmoid_opencl_kernel"; case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
default:LOG(FATAL) << "Unknown activation type: " << activation_; default:
LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
...@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, ...@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct ActivationFunctor<DeviceType::GPU, float>;
struct ActivationFunctor<DeviceType::GPU, float>; template struct ActivationFunctor<DeviceType::GPU, half>;
template
struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -71,7 +71,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -71,7 +71,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape(); std::vector<index_t> output_shape = input_tensors[0]->shape();
...@@ -87,13 +87,13 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,13 +87,13 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -23,14 +23,15 @@ namespace mace { ...@@ -23,14 +23,15 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *scale, const Tensor *input,
const Tensor *offset, const Tensor *scale,
const Tensor *mean, const Tensor *offset,
const Tensor *var, const Tensor *mean,
const float epsilon, const Tensor *var,
Tensor *output, const float epsilon,
StatsFuture *future) { Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
...@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -96,7 +97,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -96,7 +97,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -23,9 +23,9 @@ namespace kernels { ...@@ -23,9 +23,9 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
const index_t width = input->dim(2); const index_t width = input->dim(2);
...@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -67,7 +67,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -67,7 +67,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} else { } else {
std::vector<uint32_t> roundup_gws(lws.size()); std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) { for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
roundup_gws[i] = RoundUp(gws[i], lws[i]);
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
......
...@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
const BufferType type, const BufferType type,
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape); CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type); std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
MACE_FAILURE_RETURN(image->ResizeImage(new_shape, image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else { } else {
MACE_FAILURE_RETURN(image->ResizeImage(buffer->shape(), image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
} }
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
...@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -107,7 +106,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,7 +106,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(buffer->buffer_offset() / static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype()))); GetEnumTypeSize(buffer->dtype())));
if (type == CONV2D_FILTER) { if (type == CONV2D_FILTER) {
const index_t inner_size = const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
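The GPU functors all share the same out-of-range-check plumbing, and the Allocate(1) inside it is one of the calls this patch stops ignoring. Condensed from the hunks above into one place (the verbatim pattern, not a new API):

    // One-byte flag buffer the OpenCL kernel can write an error code into.
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
      MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));  // now checked
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;        // clear the flag
      kernel_error_->UnMap();
    }
    // ... build and enqueue the kernel, passing kernel_error_->buffer()
    // as the first argument when the check is enabled ...
    if (runtime->IsOutOfRangeCheckEnabled()) {
      kernel_error_->Map(nullptr);
      char *kerror_code = kernel_error_->mutable_data<char>();
      MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
      kernel_error_->UnMap();
    }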
...@@ -16,18 +16,16 @@ ...@@ -16,18 +16,16 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input, Tensor *output, StatsFuture *future) {
Tensor *output, MACE_RETURN_IF_ERROR(output->ResizeLike(input));
StatsFuture *future) {
MACE_FAILURE_RETURN(output->ResizeLike(input));
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
...@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const index_t channels_per_group = channels / groups_; const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0, MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be multiple of 4"); "channels per group must be multiple of 4");
MACE_CHECK(groups_ % 4 == 0, MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
"groups must be multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
...@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, kernel_ =
built_options); runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -76,7 +73,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -76,7 +73,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
struct ChannelShuffleFunctor<DeviceType::GPU, float>; template struct ChannelShuffleFunctor<DeviceType::GPU, half>;
template
struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,11 +22,9 @@ namespace mace { ...@@ -22,11 +22,9 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -37,16 +35,15 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -37,16 +35,15 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
static MaceStatus Concat2(cl::Kernel *kernel,
static void Concat2(cl::Kernel *kernel, const Tensor *input0,
const Tensor *input0, const Tensor *input1,
const Tensor *input1, const DataType dt,
const DataType dt, std::vector<index_t> *prev_input_shape,
std::vector<index_t> *prev_input_shape, Tensor *output,
Tensor *output, StatsFuture *future,
StatsFuture *future, uint32_t *kwg_size,
uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error) {
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -67,8 +64,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -67,8 +64,8 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -95,7 +92,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -95,7 +92,7 @@ static void Concat2(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -125,15 +122,17 @@ static void Concat2(cl::Kernel *kernel, ...@@ -125,15 +122,17 @@ static void Concat2(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
static void ConcatN(cl::Kernel *kernel, static MaceStatus ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -179,7 +178,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -179,7 +178,7 @@ static void ConcatN(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->is_profiling_enabled()) { if (runtime->is_profiling_enabled()) {
CallStats tmp_stats; CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats); runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros =
call_stats.start_micros); std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
} }
} }
...@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel,
} }
}; };
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
...@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()( ...@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."); "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, return Concat2(&kernel_, input_list[0], input_list[1],
&input_shape_, output, future, &kwg_size_, &kernel_error_); DataTypeToEnum<T>::value, &input_shape_, output, future,
break; &kwg_size_, &kernel_error_);
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future, return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
&kwg_size_, &kernel_error_); future, &kwg_size_, &kernel_error_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -18,61 +18,61 @@ ...@@ -18,61 +18,61 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
extern void Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
template <typename T> template <typename T>
MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)( typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter, cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding, const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
...@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), return conv2d_func(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), return Conv2dOpencl(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} }
return MACE_SUCCESS;
} }
template struct Conv2dFunctor<DeviceType::GPU, float>; template struct Conv2dFunctor<DeviceType::GPU, float>;
......
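The conv2d dispatch above now returns whatever status the selected helper produces instead of an unconditional MACE_SUCCESS. The helpers live in a function-pointer table indexed by filter size, with nullptr slots (presumably sizes 2, 4, and 5 in this version) falling through to the generic Conv2dOpencl. A sketch of that table, using the signature declared earlier in this diff:

    typedef MaceStatus (*Conv2dOpenclFunction)(
        cl::Kernel *kernel, const Tensor *input, const Tensor *filter,
        const Tensor *bias, const int stride, const int *padding,
        const int *dilations, const ActivationType activation,
        const float relux_max_limit, const DataType dt,
        std::vector<index_t> *prev_input_shape, Tensor *output,
        StatsFuture *future, uint32_t *kwg_size,
        std::unique_ptr<BufferBase> *kernel_error);

    // Assumed layout: only 1x1 and 3x3 have specialized kernels here.
    static const Conv2dOpenclFunction selector[5] = {
        Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};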
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
...@@ -25,11 +25,9 @@ namespace { ...@@ -25,11 +25,9 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128; const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]); lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size / compute_units) * 8, (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base); lws[2] = std::min<uint32_t>(gws[2], base);
} }
...@@ -57,21 +54,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -57,21 +54,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_UNUSED(padding); MACE_UNUSED(padding);
MACE_UNUSED(dilations); MACE_UNUSED(dilations);
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -145,7 +142,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -145,7 +142,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
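
The hunks above change Conv2dOpenclK1x1 from void to MaceStatus, route fallible calls (the one-byte error-buffer Allocate) through MACE_RETURN_IF_ERROR, and add a trailing return MACE_SUCCESS. A minimal sketch of how such a propagation macro can be defined — the status values and macro body here are assumptions for illustration, not MACE's actual headers; the old MACE_FAILURE_RETURN seen elsewhere in this diff behaved the same way under its previous name:

#include <cstddef>
#include <cstdio>

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

// Evaluate a status-returning expression once; on failure, return that
// status to the caller immediately instead of continuing.
#define MACE_RETURN_IF_ERROR(stmt)          \
  {                                         \
    MaceStatus _status = (stmt);            \
    if (_status != MACE_SUCCESS) {          \
      return _status;                       \
    }                                       \
  }

MaceStatus Allocate(std::size_t nbytes) {
  return nbytes > 0 ? MACE_SUCCESS : MACE_OUT_OF_RESOURCES;
}

MaceStatus InitKernelErrorBuffer() {
  MACE_RETURN_IF_ERROR(Allocate(1));  // bails out if allocation fails
  return MACE_SUCCESS;                // reached only on success
}

int main() { std::printf("status = %d\n", InitKernelErrorBuffer()); }
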
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -24,22 +24,20 @@ namespace kernels { ...@@ -24,22 +24,20 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>( uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1); OpenCLRuntime::Global()->device_compute_units() / 2, 1);
const uint32_t base = std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, const uint32_t base =
4); std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base), lws[0] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>(
RoundUp<uint32_t>(cache_size / kernel_cache_size / RoundUp<uint32_t>(
lws_size / compute_units, base), cache_size / kernel_cache_size / lws_size / compute_units, base),
gws[2]); gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base); lws[2] = std::min<uint32_t>(gws[2], base);
...@@ -50,21 +48,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -50,21 +48,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -129,7 +127,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -129,7 +127,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
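
The LocalWS heuristic in the 3x3 file mirrors the 1x1 version above: it sizes the local work-group from the device's global-memory cache and compute-unit count. A standalone sketch of that computation — the cache size, compute-unit count, and kBaseGPUMemCacheSize are hard-coded assumptions here; the real values come from OpenCLRuntime at run time:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t kBaseGPUMemCacheSize = 16384;            // assumed constant
constexpr uint32_t kKernelCacheSize = (5 + 4 + 5) * 4 * 4;  // from the file

uint32_t RoundUp(uint32_t v, uint32_t factor) {
  return (v + factor - 1) / factor * factor;
}

std::vector<uint32_t> LocalWS(const uint32_t *gws, uint32_t kwg_size) {
  const uint64_t cache_size = 524288;  // assumed 512 KiB global-memory cache
  const uint32_t compute_units = std::max<uint32_t>(8 / 2, 1);  // assumed 8 CUs
  std::vector<uint32_t> lws(4, 0);
  const uint32_t base =
      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] =
      std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      RoundUp(static_cast<uint32_t>(
                  cache_size / kKernelCacheSize / lws_size / compute_units),
              base),
      gws[2]);
  if (lws[2] == 0) lws[2] = std::min<uint32_t>(gws[2], base);
  return lws;
}

int main() {
  const uint32_t gws[3] = {16, 56, 56};  // assumed global work size
  std::vector<uint32_t> lws = LocalWS(gws, 256);
  std::printf("lws = {%u, %u, %u}\n", lws[0], lws[1], lws[2]);  // {4, 56, 4}
}
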
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kernel_size, const uint32_t kernel_size,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -41,10 +40,10 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -41,10 +40,10 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} }
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]); lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
(cache_size / kernel_cache_size / kernel_size / lws_size / compute_units) lws_size / compute_units) *
* 8, 8,
gws[2]); gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
if (gws[2] < lws_limit) { if (gws[2] < lws_limit) {
lws[2] = gws[2]; lws[2] = gws[2];
...@@ -58,21 +57,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -58,21 +57,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern void Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *padding, const int *padding,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -137,7 +136,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -137,7 +136,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -168,11 +167,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -168,11 +167,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
} }
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel", output->dim(0), Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3), output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws = std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace kernels } // namespace kernels
......
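
All three conv kernels follow the same out-of-range guard pattern that this diff makes fallible: allocate a one-byte flag buffer, zero it before launch, and inspect it after the kernel runs. A host-side sketch of that lifecycle — the Buffer class below is a stand-in for MACE's BufferBase, for illustration only:

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

// Stand-in for MACE's BufferBase; the real one wraps OpenCL device memory.
class Buffer {
 public:
  bool Allocate(std::size_t nbytes) {
    data_.resize(nbytes, 0);
    return true;
  }
  void Map() {}    // no-op here; the real buffer maps device memory to host
  void UnMap() {}
  char *mutable_data() { return data_.data(); }

 private:
  std::vector<char> data_;
};

int main() {
  std::unique_ptr<Buffer> kernel_error(new Buffer());
  kernel_error->Allocate(1);
  kernel_error->Map();
  *kernel_error->mutable_data() = 0;  // clear the flag before launching
  kernel_error->UnMap();

  // ... enqueue a kernel built with -DOUT_OF_RANGE_CHECK here ...

  kernel_error->Map();
  const char code = *kernel_error->mutable_data();
  kernel_error->UnMap();
  assert(code == 0 && "kernel reported an out-of-range access");
  std::printf("kernel error code: %d\n", code);
}
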
...@@ -20,20 +20,20 @@ namespace kernels { ...@@ -20,20 +20,20 @@ namespace kernels {
namespace { namespace {
void Deconv2dOpencl(cl::Kernel *kernel, MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *paddings, const int *paddings,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
#define MACE_WIDTH_BLK 5 #define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride - 1) / stride; const index_t n_strides = (width + stride - 1) / stride;
const index_t width_blocks = const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride; ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
const float stride_r = 1.f / static_cast<float>(stride); const float stride_r = 1.f / static_cast<float>(stride);
const int padding_h = (paddings[0]+1) >> 1; const int padding_h = (paddings[0] + 1) >> 1;
const int padding_w = (paddings[0]+1) >> 1; const int padding_w = (paddings[0] + 1) >> 1;
const int align_h = stride - 1 - padding_h; const int align_h = stride - 1 - padding_h;
const int align_w = stride - 1 - padding_w; const int align_w = stride - 1 - padding_w;
...@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel,
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP:break; case NOOP:
case RELU:built_options.emplace("-DUSE_RELU");
break; break;
case RELUX:built_options.emplace("-DUSE_RELUX"); case RELU:
built_options.emplace("-DUSE_RELU");
break; break;
case TANH:built_options.emplace("-DUSE_TANH"); case RELUX:
built_options.emplace("-DUSE_RELUX");
break; break;
case SIGMOID:built_options.emplace("-DUSE_SIGMOID"); case TANH:
built_options.emplace("-DUSE_TANH");
break; break;
default:LOG(FATAL) << "Unknown activation type: " << activation; case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
} }
*kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options); *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
...@@ -150,16 +156,19 @@ void Deconv2dOpencl(cl::Kernel *kernel, ...@@ -150,16 +156,19 @@ void Deconv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace } // namespace
template <typename T> template <typename T>
MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *filter, const Tensor *input,
const Tensor *bias, const Tensor *filter,
Tensor *output, const Tensor *bias,
StatsFuture *future) { Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
...@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (output_shape_.size() == 4) { if (output_shape_.size() == 4) {
paddings_.clear(); paddings_.clear();
paddings_ = std::vector<int>(2, 0); paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize( CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
input->shape().data(), strides_, padding_type_, output_shape_.data(),
filter->shape().data(), paddings_.data());
strides_, padding_type_,
output_shape_.data(),
paddings_.data());
} else { } else {
output_shape_.clear(); output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0); output_shape_ = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(), CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
filter->shape().data(), strides_, output_shape_.data(), paddings_.data());
strides_,
output_shape_.data(),
paddings_.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape_, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape_, output_image_shape));
Deconv2dOpencl(&kernel_, input, filter, bias, return Deconv2dOpencl(&kernel_, input, filter, bias, strides_[0],
strides_[0], paddings_.data(), paddings_.data(), activation_, relux_max_limit_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, output, future,
DataTypeToEnum<T>::value, &input_shape_, &kwg_size_, &kernel_error_);
output, future, &kwg_size_, &kernel_error_);
return MACE_SUCCESS;
} }
template struct Deconv2dFunctor<DeviceType::GPU, float>; template struct Deconv2dFunctor<DeviceType::GPU, float>;
......
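
A worked example of the deconv sizing arithmetic reformatted above (MACE_WIDTH_BLK = 5), using made-up inputs width = 28, stride = 2, paddings[0] = 2:

#include <cstdio>

int main() {
  const int MACE_WIDTH_BLK = 5;
  const int width = 28, stride = 2, padding0 = 2;  // assumed inputs
  const int n_strides = (width + stride - 1) / stride;               // 14
  const int width_blocks =
      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;  // 6
  const int padding_h = (padding0 + 1) >> 1;                         // 1
  const int align_h = stride - 1 - padding_h;                        // 0
  std::printf("n_strides=%d width_blocks=%d padding_h=%d align_h=%d\n",
              n_strides, width_blocks, padding_h, align_h);
}
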
...@@ -40,7 +40,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,7 +40,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width * block_size_; output_width = input_width * block_size_;
output_depth = input_depth / (block_size_ * block_size_); output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not support:") MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
<< output_depth; << output_depth;
kernel_name = "depth_to_space"; kernel_name = "depth_to_space";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
...@@ -53,7 +53,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -53,7 +53,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width / block_size_; output_width = input_width / block_size_;
output_depth = input_depth * block_size_ * block_size_; output_depth = input_depth * block_size_ * block_size_;
MACE_CHECK(input_depth % 4 == 0, "input channel not support:") MACE_CHECK(input_depth % 4 == 0, "input channel not support:")
<< input_depth; << input_depth;
kernel_name = "space_to_depth"; kernel_name = "space_to_depth";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
...@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
runtime->BuildKernel("depth_to_space", built_options);
obfuscated_kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -107,7 +106,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,7 +106,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
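
The shape arithmetic behind the depth_to_space path above, on an assumed NHWC input of 1x8x8x16 with block_size = 2:

#include <cstdio>

int main() {
  const int block_size = 2;
  const int input_height = 8, input_width = 8, input_depth = 16;  // assumed
  const int output_height = input_height * block_size;            // 16
  const int output_width = input_width * block_size;              // 16
  const int output_depth = input_depth / (block_size * block_size);  // 4
  // The kernel checks output_depth % 4 == 0 so channels pack into
  // 4-wide image pixels.
  std::printf("out: %dx%dx%d (depth %% 4 == %d)\n",
              output_height, output_width, output_depth, output_depth % 4);
}
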
...@@ -24,8 +24,7 @@ namespace kernels { ...@@ -24,8 +24,7 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize; uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
...@@ -40,9 +39,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -40,9 +39,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} }
} }
const uint32_t lws_size = lws[0] * lws[1]; const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>( lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
(cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
gws[2]);
if (lws[2] == 0) { if (lws[2] == 0) {
lws[2] = gws[2]; lws[2] = gws[2];
} }
...@@ -52,21 +50,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -52,21 +50,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
static void DepthwiseConv2d(cl::Kernel *kernel, static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC const Tensor *input, // NHWC
const Tensor *filter, // HWIM const Tensor *filter, // HWIM
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const int *paddings, const int *paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -149,7 +147,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -149,7 +147,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
...@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", std::string tuning_key =
gws[0], gws[1], gws[2], multiplier); Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
...@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) { if (strides_[0] != strides_[1]) {
...@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), return DepthwiseConv2d(
dilations_, activation_, relux_max_limit_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
DataTypeToEnum<T>::value, &input_shape_, output, future, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
&kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
return MACE_SUCCESS;
} }
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
......
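
Note the shape of the functor refactor above: instead of calling the static helper and then unconditionally returning MACE_SUCCESS, the functor now tail-returns the helper's status, so an allocation failure inside DepthwiseConv2d reaches the caller. A condensed sketch of that before/after, with illustrative names:

#include <cstdio>

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

// Stand-in for the static helper; 'fail' simulates a failed Allocate(1).
MaceStatus DepthwiseConv2d(bool fail) {
  if (fail) return MACE_OUT_OF_RESOURCES;
  return MACE_SUCCESS;
}

MaceStatus Functor(bool fail) {
  // Before: DepthwiseConv2d(...); return MACE_SUCCESS;   // dropped failures
  return DepthwiseConv2d(fail);  // after: the caller sees the real status
}

int main() {
  std::printf("ok run: %d, failing run: %d\n", Functor(false), Functor(true));
}
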
...@@ -22,16 +22,15 @@ namespace kernels { ...@@ -22,16 +22,15 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
bool swapped = false; bool swapped = false;
if (input1 != nullptr) { if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size() MACE_CHECK(input0->dim_size() == input1->dim_size() ||
|| input0->dim_size() == 1 input0->dim_size() == 1 || input1->dim_size() == 1)
|| input1->dim_size() == 1) << "Inputs of Eltwise op must be same shape";
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) { if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) { if (input0->size() < input1->size()) {
std::swap(input0, input1); std::swap(input0, input1);
...@@ -39,28 +38,26 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -39,28 +38,26 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
} }
if (input1->dim_size() == 1) { if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(3) == input1->dim(0)) MACE_CHECK(input0->dim(3) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast"; << "Element-Wise op only support channel dimension broadcast";
} else { } else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) && MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
input0->dim(3) == input1->dim(3) && input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
input1->dim(1) == 1 && input1->dim(2) == 1)
input1->dim(2) == 1) << "Element-Wise op only support channel dimension broadcast";
<< "Element-Wise op only support channel dimension broadcast";
} }
} }
} }
std::vector<index_t > output_shape(4); std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0); output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1); output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2); output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3); output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,7 +112,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -115,7 +112,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
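
The eltwise checks above enforce that only channel-dimension broadcast is supported. A condensed version of that validation for two NHWC shapes, purely illustrative:

#include <cstdio>
#include <vector>

// 'a' is assumed to be the full NHWC shape; the functor above swaps the
// inputs first so the larger tensor always comes first.
bool BroadcastOk(const std::vector<int> &a, const std::vector<int> &b) {
  if (b.size() == 1) return a[3] == b[0];         // per-channel vector [C]
  if (a.size() != b.size()) return false;         // otherwise ranks must match
  return (a[0] == b[0] || b[0] == 1) &&           // batch equal or broadcast
         a[3] == b[3] && b[1] == 1 && b[2] == 1;  // only H = W = 1 may differ
}

int main() {
  std::printf("%d %d %d\n",
              BroadcastOk({1, 8, 8, 16}, {16}),            // 1
              BroadcastOk({1, 8, 8, 16}, {1, 1, 1, 16}),   // 1
              BroadcastOk({1, 8, 8, 16}, {1, 8, 1, 16}));  // 0
}
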
...@@ -20,18 +20,18 @@ namespace kernels { ...@@ -20,18 +20,18 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T>
void FCWXKernel(cl::Kernel *kernel, MaceStatus FCWXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future, StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -115,7 +115,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -115,7 +115,7 @@ void FCWXKernel(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
...@@ -170,21 +170,23 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -170,21 +170,23 @@ void FCWXKernel(cl::Kernel *kernel,
} }
}; };
} }
return MACE_SUCCESS;
} }
template <typename T> template <typename T>
void FCWTXKernel(cl::Kernel *kernel, MaceStatus FCWTXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future, StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1); MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t kwg_size = uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
*lws = {16, kwg_size/16, 0}; *lws = {16, kwg_size / 16, 0};
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -246,7 +248,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -246,7 +248,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
...@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel,
} }
std::string tuning_key = std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(1), output->dim(2), output->dim(3)); output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
} }
return MACE_SUCCESS;
} }
} // namespace } // namespace
...@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()( ...@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
return MACE_SUCCESS; return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
} }
template struct FullyConnectedFunctor<DeviceType::GPU, float>; template struct FullyConnectedFunctor<DeviceType::GPU, float>;
......
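
The FCWTXKernel hunk above normalizes the default local work-group to {16, kwg_size / 16, 0}. A tiny check that this factorization saturates but never exceeds the device limit — kwg_size = 256 is an assumed value; the real one comes from GetKernelMaxWorkGroupSize:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t kwg_size = 256;  // assumed device work-group limit
  const uint32_t lws0 = 16, lws1 = kwg_size / 16;
  assert(lws0 * lws1 <= kwg_size);
  std::printf("lws = {%u, %u, 0}\n", lws0, lws1);
}
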
...@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), lws[2] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::min<uint32_t>(base, kwg_size / lws_size); lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
return lws; return lws;
...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 4) MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D"; << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+3); std::vector<uint32_t> internal_gws(gws, gws + 3);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3]; uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size; uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size)); gws2 = (internal_gws[2] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<std::vector<uint32_t>> results; std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = { std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0}, {kwg_size / 2, 2, 0}, {kwg_size / 4, 4, 0},
{kwg_size / 4, 4, 0}, {kwg_size / 8, 8, 0}, {kwg_size / 16, 16, 0},
{kwg_size / 8, 8, 0}, {kwg_size / 32, 32, 0}, {kwg_size / 64, 64, 0},
{kwg_size / 16, 16, 0}, {kwg_size / 128, 128, 0}, {kwg_size / 256, 256, 0},
{kwg_size / 32, 32, 0}, {kwg_size, 1, 0}, {1, kwg_size, 0}};
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
for (auto &ele : candidates) { for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2]; const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) { if (0 < tmp && tmp <= kwg_size) {
...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 3) MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d"; << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+2); std::vector<uint32_t> internal_gws(gws, gws + 2);
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]); internal_gws[i] = RoundUp(gws[i], params[i]);
...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
if (timer == nullptr) { if (timer == nullptr) {
uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2]; uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], const uint32_t num_blocks =
block_size); RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size); num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size; uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported() if (runtime->IsNonUniformWorkgroupsSupported() &&
&& (i == num_blocks - 1)) { (i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size)); gws1 = (internal_gws[1] - (i * block_size));
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -88,10 +88,9 @@ inline bool LimitKernelTime() { ...@@ -88,10 +88,9 @@ inline bool LimitKernelTime() {
} }
template <typename T> template <typename T>
bool IsVecEqual(const std::vector<T> &input0, bool IsVecEqual(const std::vector<T> &input0, const std::vector<T> &input1) {
const std::vector<T> &input1) {
return ((input0.size() == input1.size()) && return ((input0.size() == input1.size()) &&
(std::equal(input0.begin(), input0.end(), input1.begin()))); (std::equal(input0.begin(), input0.end(), input1.begin())));
} }
template <typename T> template <typename T>
......
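
The TuningOrRun3DKernel loop reformatted above splits the third global dimension into chunks of block_size; when the device supports non-uniform work-groups, the last chunk keeps only the remainder instead of a rounded-up size. A sketch of just that splitting logic, with assumed sizes:

#include <cstdint>
#include <cstdio>

uint32_t RoundUpDiv(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
  const uint32_t gws2_total = 100, block_size = 32;  // assumed sizes
  const bool non_uniform_supported = true;           // device capability
  const uint32_t num_blocks = RoundUpDiv(gws2_total, block_size);  // 4
  for (uint32_t i = 0; i < num_blocks; ++i) {
    uint32_t gws2 = block_size;
    if (non_uniform_supported && i == num_blocks - 1) {
      gws2 = gws2_total - i * block_size;  // tail block: 100 - 96 = 4
    }
    std::printf("block %u: offset %u, size %u\n", i, i * block_size, gws2);
  }
}
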
...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
const BufferType type, const BufferType type,
Tensor *buffer, Tensor *buffer,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape); CalImage2DShape(image->shape(), type, &image_shape);
MACE_FAILURE_RETURN(buffer->Resize(image->shape())); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])}; static_cast<uint32_t>(image_shape[1])};
...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -100,7 +99,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +99,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} }
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) { if (type == CONV2D_FILTER) {
const index_t inner_size = const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
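
A worked example of the CONV2D_FILTER argument math in the hunk above, where inner_size collapses the three inner extents of the filter buffer. The shape below is assumed, and the exact layout convention is MACE's, not shown in this hunk, so treat the dimension names as illustrative:

#include <cstdio>

int main() {
  const long dims[4] = {32, 16, 3, 3};                  // assumed filter shape
  const long inner_size = dims[1] * dims[2] * dims[3];  // 16 * 3 * 3 = 144
  std::printf("inner_size = %ld\n", inner_size);
}
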
...@@ -22,14 +22,14 @@ namespace kernels { ...@@ -22,14 +22,14 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_FAILURE_RETURN(C->ResizeImage(c_shape, c_image_shape)); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0); const index_t batch = C->dim(0);
const index_t height = C->dim(1); const index_t height = C->dim(1);
...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -71,7 +71,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -71,7 +71,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
Concat("matmul_opencl_kernel", C->dim(0), C->dim(2), C->dim(3));
C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
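
The matmul tuning key above is built by concatenating a kernel name with the output dimensions. One way such a variadic Concat can be written — illustrative only; MACE's real Concat lives in mace/utils and may join the fields differently:

#include <iostream>
#include <sstream>
#include <string>

template <typename... Args>
std::string Concat(Args... args) {
  std::ostringstream ss;
  ((ss << args << '_'), ...);  // C++17 fold expression over the arguments
  std::string key = ss.str();
  key.pop_back();              // drop the trailing separator
  return key;
}

int main() {
  std::cout << Concat("matmul_opencl_kernel", 1, 128, 128, 1) << "\n";
}
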
...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>( kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0; *(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap(); kernel_error->UnMap();
...@@ -70,7 +70,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -70,7 +70,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer()))); *(static_cast<cl::Buffer *>(kernel_error->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
bool is_out_of_range = false; bool is_out_of_range = false;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
is_out_of_range = is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
*(kernel_error->mutable_data<char>()) == 1 ? true : false;
kernel_error->UnMap(); kernel_error->UnMap();
} }
return is_out_of_range; return is_out_of_range;
...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class OutOfRangeCheckTest : public ::testing::Test { class OutOfRangeCheckTest : public ::testing::Test {
protected: protected:
virtual void SetUp() { virtual void SetUp() { setenv("OUT_OF_RANGE_CHECK", "1", 1); }
setenv("OUT_OF_RANGE_CHECK", "1", 1);
}
}; };
TEST(OutOfRangeCheckTest, RandomTest) { TEST(OutOfRangeCheckTest, RandomTest) {
...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels}; std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws; Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer", Tensor *buffer =
GetDeviceAllocator(DeviceType::GPU), ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
......
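Editor's note: every GPU functor in this change repeats the same host-side bookkeeping around a one-byte kernel-error buffer. Condensed into two hypothetical helpers (the real code inlines these steps exactly as the hunks show), the protocol is:

#include <memory>

// Hypothetical helper: allocate and zero the one-byte error flag.
MaceStatus InitKernelErrorBuffer(std::unique_ptr<Buffer> *kernel_error) {
  *kernel_error = std::unique_ptr<Buffer>(
      new Buffer(GetDeviceAllocator(DeviceType::GPU)));
  MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));  // one status byte
  (*kernel_error)->Map(nullptr);
  *((*kernel_error)->mutable_data<char>()) = 0;  // 0 means "in range"
  (*kernel_error)->UnMap();
  return MACE_SUCCESS;
}

// Hypothetical helper: read the flag back after the kernel runs.
bool ReadOutOfRangeFlag(Buffer *kernel_error) {
  kernel_error->Map(nullptr);
  // Kernels built with -DOUT_OF_RANGE_CHECK write 1 into this byte
  // when they detect an out-of-bounds access.
  const char flag = *(kernel_error->mutable_data<char>());
  kernel_error->UnMap();
  return flag == 1;
}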
...@@ -20,26 +20,25 @@ ...@@ -20,26 +20,25 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *input, Tensor *output,
Tensor *output, StatsFuture *future) {
StatsFuture *future) { MACE_CHECK(this->paddings_.size() ==
MACE_CHECK( static_cast<size_t>((input->dim_size() * 2)));
this->paddings_.size() == static_cast<size_t>((input->dim_size() * 2))); MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
&& (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) << "MACE only supports height/width dimensions now";
<< "MACE only supports height/width dimensions now";
auto input_shape = input->shape(); auto input_shape = input->shape();
std::vector<index_t> std::vector<index_t> output_shape = {
output_shape = {input_shape[0] + this->paddings_[0] + this->paddings_[1], input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3], input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5], input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]}; input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(2), output->dim(3));
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct PadFunctor<DeviceType::GPU, float>;
struct PadFunctor<DeviceType::GPU, float>; template struct PadFunctor<DeviceType::GPU, half>;
template
struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
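Editor's note: the padding arithmetic above expands each dimension by its before/after pair; a worked NHWC example (values are illustrative only):

// Worked example: NHWC input {1, 5, 5, 3} with
// paddings_ = {0, 0, 1, 1, 2, 2, 0, 0} (a before/after pair per dim):
//   out_N = 1 + 0 + 0 = 1   // batch padding must stay zero (checked)
//   out_H = 5 + 1 + 1 = 7
//   out_W = 5 + 2 + 2 = 9
//   out_C = 3 + 0 + 0 = 3   // channel padding must stay zero (checked)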
...@@ -23,15 +23,13 @@ namespace kernels { ...@@ -23,15 +23,13 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), lws[2] =
kwg_size / lws[1]); std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4; lws[0] = gws[0] / 4;
if (lws[0] == 0) { if (lws[0] == 0) {
...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T> template <typename T>
MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t out_height = output->dim(1); index_t out_height = output->dim(1);
...@@ -125,7 +123,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -125,7 +123,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
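Editor's note: LocalWS derives the local work-group size from the device's global memory cache; plugging in sample numbers makes the visible clamps concrete. The lws[0] branch is cut off by the hunk above, so it is omitted, and the sample gws, kwg_size, and cache ratio are assumptions:

#include <algorithm>
#include <cstdint>
#include <vector>

// Worked example of the visible part of Pooling's LocalWS.
std::vector<uint32_t> LocalWSExample() {
  const uint32_t gws[3] = {32, 64, 128};
  const uint32_t kwg_size = 256;
  const uint32_t base = 4;  // cache_size / kBaseGPUMemCacheSize, assumed
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);   // min(64, 256)     -> 64
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);  // min(4, 256 / 64) -> 4
  return lws;  // {0, 64, 4, 0} before the elided lws[0] handling
}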
...@@ -23,11 +23,9 @@ namespace mace { ...@@ -23,11 +23,9 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -110,7 +108,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -110,7 +108,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
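Editor's note: the resize path depends on CalculateResizeScale, which this hunk does not show. A presumed definition, following the TensorFlow convention that MACE mirrors (the real helper is defined elsewhere in the kernels tree, so treat this as a sketch):

inline float CalculateResizeScale(index_t in_size,
                                  index_t out_size,
                                  bool align_corners) {
  // align_corners maps the corner pixels of input and output onto each
  // other; otherwise the scale is the plain size ratio.
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
}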
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -29,14 +29,15 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -29,14 +29,15 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const size_t outputs_count = output_list.size(); const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count; const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0) MACE_CHECK(output_channels % 4 == 0)
<< "output channels of slice op must be divisible by 4"; << "output channels of slice op must be divisible by 4";
std::vector<index_t> output_shape({input->dim(0), input->dim(1), std::vector<index_t> output_shape(
input->dim(2), output_channels}); {input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE=" +
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blk = RoundUpDiv4(output_channels); const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
...@@ -80,7 +80,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -80,7 +80,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->is_profiling_enabled()) { if (runtime->is_profiling_enabled()) {
CallStats tmp_stats; CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats); runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros =
call_stats.start_micros); std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
} }
} }
...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
template template struct SliceFunctor<DeviceType::GPU, float>;
struct SliceFunctor<DeviceType::GPU, float>; template struct SliceFunctor<DeviceType::GPU, half>;
template
struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
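Editor's note: a worked example of the channel split checked above (numbers are illustrative only):

// Input {1, 32, 32, 64} sliced into outputs_count = 2 tensors:
//   output_channels = 64 / 2 = 32        // passes the % 4 == 0 check
//   output_shape    = {1, 32, 32, 32}
//   channel_blk     = RoundUpDiv4(32) = 8
//   gws             = {channel_blk, W, N * H} = {8, 32, 32}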
...@@ -24,10 +24,8 @@ namespace kernels { ...@@ -24,10 +24,8 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
const uint32_t kwg_size) { uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T> template <typename T>
MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = logits->dim(0); const index_t batch = logits->dim(0);
const index_t height = logits->dim(1); const index_t height = logits->dim(1);
const index_t width = logits->dim(2); const index_t width = logits->dim(2);
...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -88,7 +86,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -88,7 +86,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0), Concat("softmax_opencl_kernel", output->dim(0), output->dim(1),
output->dim(1), output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -26,17 +26,13 @@ namespace kernels { ...@@ -26,17 +26,13 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
Tensor *batch_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
if (b2s_) { if (b2s_) {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
DataFormat::NHWC,
output_shape.data()); output_shape.data());
} }
...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape); &output_image_shape);
if (b2s_) { if (b2s_) {
MACE_FAILURE_RETURN(space_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); space_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "batch_to_space"; kernel_name = "batch_to_space";
} else { } else {
MACE_FAILURE_RETURN(batch_tensor->ResizeImage(output_shape, MACE_RETURN_IF_ERROR(
output_image_shape)); batch_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "space_to_batch"; kernel_name = "space_to_batch";
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
runtime->BuildKernel("space_to_batch", built_options);
obfuscated_kernel_name, built_options);
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...@@ -92,7 +87,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -92,7 +87,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
...@@ -24,7 +24,6 @@ namespace kernels { ...@@ -24,7 +24,6 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -78,12 +77,12 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -78,12 +77,12 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_shape = {16, input_tensor->dim(3), out_width, 1}; output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = std::string tuning_key = Concat("winograd_transform_kernel",
Concat("winograd_transform_kernel", output_tensor->dim(0), output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(2), output_tensor->dim(3));
output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1); MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -188,14 +185,14 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -188,14 +185,14 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
input_tensor->dim(1)}; input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
} }
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
......
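Editor's note: the shapes in the Winograd hunks encode F(2x2, 3x3) tiling; the arithmetic, with the tile-count mapping inferred rather than shown:

// Each 4x4 input tile carries 16 values -- hence output_shape's leading
// dimension of 16 -- and produces one 2x2 output tile, so
//   round_h = (height_ + 1) / 2;  // ceil(height / 2) tiles vertically
//   round_w = (width_  + 1) / 2;  // ceil(width  / 2) tiles horizontally
// out_width presumably counts batch * round_h * round_w tiles; that
// mapping is an inference from context, not shown in these hunks.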
...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase { ...@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase {
MACE_CHECK( MACE_CHECK(
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2); this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape(); auto input_shape = input->shape();
MACE_FAILURE_RETURN(output->Resize({input_shape[0] + this->paddings_[0] MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0]
+ this->paddings_[1], + this->paddings_[1],
input_shape[1] + this->paddings_[2] input_shape[1] + this->paddings_[2]
+ this->paddings_[3], + this->paddings_[3],
......
...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
RoundType::CEIL, RoundType::CEIL,
output_shape.data()); output_shape.data());
} }
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor); Tensor::MappingGuard output_guard(output_tensor);
......
...@@ -267,7 +267,7 @@ struct ProposalFunctor { ...@@ -267,7 +267,7 @@ struct ProposalFunctor {
// Our RPN implementation only supports a single input image, so all // Our RPN implementation only supports a single input image, so all
// batch inds are 0 // batch inds are 0
size = static_cast<int>(nms_result.size()); size = static_cast<int>(nms_result.size());
MACE_FAILURE_RETURN(output->Resize({size, 1, 1, 5})); MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5}));
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<float>();
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
......
...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor { ...@@ -50,7 +50,7 @@ struct PSROIAlignFunctor {
const index_t num_rois = rois->dim(0); const index_t num_rois = rois->dim(0);
const index_t batch_size = input->dim(0); const index_t batch_size = input->dim(0);
MACE_FAILURE_RETURN(output->Resize({num_rois, pooled_height, pooled_width, MACE_RETURN_IF_ERROR(output->Resize({num_rois, pooled_height, pooled_width,
output_dim_})); output_dim_}));
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
......
...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
index_t out_width = out_width_; index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, channels, out_height, out_width};
MACE_FAILURE_RETURN(output->Resize(out_shape)); MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
......
...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase { ...@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
1, 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i= 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->Resize(output_shape)); MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape));
output_ptrs[i] = output_list[i]->mutable_data<T>(); output_ptrs[i] = output_list[i]->mutable_data<T>();
} }
const T *input_ptr = input->data<T>(); const T *input_ptr = input->data<T>();
......
...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { ...@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
CalculateBatchToSpaceOutputShape(batch_tensor, CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(space_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
} else { } else {
CalculateSpaceToBatchOutputShape(space_tensor, CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NCHW, DataFormat::NCHW,
output_shape.data()); output_shape.data());
MACE_FAILURE_RETURN(batch_tensor->Resize(output_shape)); MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
} }
Tensor::MappingGuard input_guard(space_tensor); Tensor::MappingGuard input_guard(space_tensor);
......
...@@ -15,7 +15,6 @@ cc_library( ...@@ -15,7 +15,6 @@ cc_library(
hdrs = [ hdrs = [
"ops_test_util.h", "ops_test_util.h",
], ],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [ deps = [
"//mace/core", "//mace/core",
"@gtest//:gtest", "@gtest//:gtest",
...@@ -36,18 +35,23 @@ cc_library( ...@@ -36,18 +35,23 @@ cc_library(
[ [
"buffer_to_image.cc", "buffer_to_image.cc",
"image_to_buffer.cc", "image_to_buffer.cc",
]), ],
),
hdrs = glob( hdrs = glob(
["*.h"], ["*.h"],
exclude = ["ops_test_util.h"], exclude = ["ops_test_util.h"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [ deps = [
"//mace/kernels", "//mace/kernels",
], ],
...@@ -60,13 +64,17 @@ cc_test( ...@@ -60,13 +64,17 @@ cc_test(
srcs = glob( srcs = glob(
["*_test.cc"], ["*_test.cc"],
), ),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
...@@ -80,13 +88,17 @@ cc_test( ...@@ -80,13 +88,17 @@ cc_test(
name = "ops_benchmark", name = "ops_benchmark",
testonly = 1, testonly = 1,
srcs = glob(["*_benchmark.cc"]), srcs = glob(["*_benchmark.cc"]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
if_openmp_enabled(["-fopenmp"]) + "-DMACE_ENABLE_NEON",
if_neon_enabled(["-DMACE_ENABLE_NEON"]) + ]) + if_android_armv7([
if_android_armv7(["-mfpu=neon"]) + "-mfpu=neon",
if_android_armv7(["-mfloat-abi=softfp"]) + ]) + if_android_armv7([
if_android(["-DMACE_ENABLE_OPENCL"]) + "-mfloat-abi=softfp",
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), ]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
......
...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> { ...@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> {
functor_(kernels::StringToActivationType( functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
static_cast<T>(OperatorBase::GetOptionalArg<float>( static_cast<T>(
"max_limit", 0.0f))) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr; this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->Output(0); Tensor *output_tensor = this->Output(0);
MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensor)); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor));
return functor_(input_tensor, alpha_tensor, output_tensor, future); return functor_(input_tensor, alpha_tensor, output_tensor, future);
} }
......
...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { ...@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::GPU>(); TestUnalignedSimpleRelu<DeviceType::GPU>();
} }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void TestSimpleRelux() { void TestSimpleRelux() {
...@@ -169,9 +168,7 @@ void TestSimpleRelux() { ...@@ -169,9 +168,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) { TEST_F(ActivationOpTest, OPENCLSimple) { TestSimpleRelux<DeviceType::GPU>(); }
TestSimpleRelux<DeviceType::GPU>();
}
namespace { namespace {
template <DeviceType D> template <DeviceType D>
...@@ -278,9 +275,7 @@ void TestSimplePrelu() { ...@@ -278,9 +275,7 @@ void TestSimplePrelu() {
} }
} // namespace } // namespace
TEST_F(ActivationOpTest, CPUSimplePrelu) { TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::GPU>(); TestSimplePrelu<DeviceType::GPU>();
......
...@@ -97,8 +97,8 @@ void SimpleAdd3() { ...@@ -97,8 +97,8 @@ void SimpleAdd3() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 2, 3, 1}, auto expected =
{-0.000713, 8, 12, 16, 20, 24}); CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
} }
...@@ -160,8 +160,8 @@ void RandomTest() { ...@@ -160,8 +160,8 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2, 1e-2); 1e-2);
} }
} }
} // namespace } // namespace
......
...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> { ...@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> {
var->dim_size()); var->dim_size());
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future); return functor_(input, scale, offset, mean, var, epsilon_, output, future);
} }
......
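Editor's note: for the BatchNorm tests that follow, the reference computation is standard inference-time batch normalization; a minimal scalar sketch (names match the op's inputs, and epsilon is the "epsilon" arg the tests set):

#include <cmath>

// Per-channel inference batch norm, applied elementwise.
inline float BatchNormScalar(float x, float scale, float offset,
                             float mean, float var, float epsilon) {
  return scale * (x - mean) / std::sqrt(var + epsilon) + offset;
}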
...@@ -22,7 +22,7 @@ namespace test { ...@@ -22,7 +22,7 @@ namespace test {
class BatchNormOpTest : public OpsTestBase {}; class BatchNormOpTest : public OpsTestBase {};
namespace { namespace {
template<DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
OpsTestNet net; OpsTestNet net;
...@@ -37,14 +37,14 @@ void Simple() { ...@@ -37,14 +37,14 @@ void Simple() {
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -62,14 +62,14 @@ void Simple() { ...@@ -62,14 +62,14 @@ void Simple() {
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -79,10 +79,9 @@ void Simple() { ...@@ -79,10 +79,9 @@ void Simple() {
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
0.8291, 0.8291, 3.1708, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
} }
...@@ -103,35 +102,31 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,35 +102,31 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
// Construct graph // Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -140,25 +135,25 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -140,25 +135,25 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("OutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -170,7 +165,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -170,7 +165,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -186,34 +181,30 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -186,34 +181,30 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>("Input",
"Input", {batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NHWC,
"InputNCHW",
NCHW); NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW") .Input("InputNCHW")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
.Input("Mean") .Input("Mean")
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW") .Output("OutputNCHW")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// run cpu // run cpu
net.RunOp(); net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NCHW,
"Output",
NHWC); NHWC);
// Check // Check
...@@ -222,26 +213,26 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -222,26 +213,26 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("InputImage")
.Input("ScaleImage") .Input("ScaleImage")
.Input("OffsetImage") .Input("OffsetImage")
.Input("MeanImage") .Input("MeanImage")
.Input("VarImage") .Input("VarImage")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
...@@ -253,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -253,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
@@ -269,34 +260,30 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
   OpsTestNet net;
   // Add input data
-  net.AddRandomInput<DeviceType::GPU, float>(
-      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Input",
+                                             {batch, height, width, channels});
   net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                  NHWC,
-                                                  "InputNCHW",
-                                                  NCHW);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputNCHW")
       .Input("Scale")
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
       .AddFloatArg("epsilon", 1e-3)
       .Output("OutputNCHW")
       .Finalize(net.NewOperatorDef());
   // run cpu
   net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                  NCHW,
-                                                  "Output",
-                                                  NHWC);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+                                                  NHWC);
   // Check
@@ -305,25 +292,25 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                         kernels::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
                                         kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
                                         kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
                                         kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
                                         kernels::BufferType::ARGUMENT);
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
       .Input("ScaleImage")
       .Input("OffsetImage")
       .Input("MeanImage")
       .Input("VarImage")
       .AddFloatArg("epsilon", 1e-3)
       .Output("OutputImage")
       .Finalize(net.NewOperatorDef());
   // tuning
   setenv("MACE_TUNING", "1", 1);
@@ -335,7 +322,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
   net.Sync();
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                         kernels::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
 }
@@ -351,34 +338,30 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
   OpsTestNet net;
   // Add input data
-  net.AddRandomInput<DeviceType::GPU, float>(
-      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Input",
+                                             {batch, height, width, channels});
   net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
   net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                  NHWC,
-                                                  "InputNCHW",
-                                                  NCHW);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputNCHW")
       .Input("Scale")
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
       .AddFloatArg("epsilon", 1e-1)
       .Output("OutputNCHW")
       .Finalize(net.NewOperatorDef());
   // run cpu
   net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                  NCHW,
-                                                  "Output",
-                                                  NHWC);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+                                                  NHWC);
   // Check
@@ -387,26 +370,26 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
   // Run on opencl
   BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
                                        kernels::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
                                        kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
                                        kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
                                        kernels::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
                                        kernels::BufferType::ARGUMENT);
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
       .Input("ScaleImage")
       .Input("OffsetImage")
       .Input("MeanImage")
       .Input("VarImage")
       .AddFloatArg("epsilon", 1e-1)
       .Output("OutputImage")
       .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
       .Finalize(net.NewOperatorDef());
   // tuning
   setenv("MACE_TUNING", "1", 1);
@@ -418,7 +401,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
   net.Sync();
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                         kernels::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
 }
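For context: each of the three tests above builds a CPU reference result and checks the OpenCL output against it. A minimal sketch of that reference computation, assuming MACE follows the standard batch-norm definition (the Scale/Offset/Mean/Var inputs and the epsilon argument suggest it does; illustration only, not the MACE kernel):

#include <cmath>
#include <cstddef>
#include <vector>

// Per-channel batch norm over NHWC data:
// y = scale * (x - mean) / sqrt(var + epsilon) + offset
void ReferenceBatchNormNHWC(const std::vector<float> &input, size_t pixels,
                            size_t channels, const std::vector<float> &scale,
                            const std::vector<float> &offset,
                            const std::vector<float> &mean,
                            const std::vector<float> &var, float epsilon,
                            std::vector<float> *output) {
  output->resize(input.size());
  for (size_t p = 0; p < pixels; ++p) {
    for (size_t c = 0; c < channels; ++c) {
      const float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
      (*output)[p * channels + c] =
          scale[c] * (input[p * channels + c] - mean[c]) * inv_std + offset[c];
    }
  }
}

Note that the half variants run with a larger epsilon (1e-1) and looser ExpectTensorNear tolerances (1e-1, 1e-2) than the float variant (epsilon 1e-3, tolerances 1e-5, 1e-4), since fp16 carries only about three significant decimal digits.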
...
@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *batch_tensor = this->Input(INPUT);
     Tensor *space_tensor = this->Output(OUTPUT);
-    return functor_(space_tensor, const_cast<Tensor *>(batch_tensor),
-                    future);
+    return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future);
   }

  private:
...
@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> {
                  bias->dim_size());
     Tensor *output = this->Output(OUTPUT);
-    MACE_FAILURE_RETURN(output->ResizeLike(input));
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
     return functor_(input, bias, output, future);
   }
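The MACE_FAILURE_RETURN -> MACE_RETURN_IF_ERROR rename that runs through this merge request is mechanical: both names denote the same early-return pattern for MaceStatus. A hypothetical sketch of what such a macro typically expands to (assumed, not copied from the MACE headers; MACE_SUCCESS is assumed to be the success enumerator):

// Evaluate an expression yielding MaceStatus; on failure, propagate it
// by returning from the enclosing function.
#define MACE_RETURN_IF_ERROR(stmt)             \
  do {                                         \
    MaceStatus status_ = (stmt);               \
    if (status_ != MaceStatus::MACE_SUCCESS) { \
      return status_;                          \
    }                                          \
  } while (0)

The do { ... } while (0) wrapper keeps the macro safe to use as a single statement inside unbraced if/else branches.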
...
This diff is collapsed.
@@ -71,27 +71,27 @@ TEST(BufferToImageTest, ArgLarge) {
 TEST(BufferToImageTest, InputSmallSingleChannel) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                    {1, 2, 3, 1});
 }

 TEST(BufferToImageTest, InputSmallMultipleChannel) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                    {1, 2, 3, 3});
 }

 TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                    {3, 2, 3, 3});
 }

 TEST(BufferToImageTest, InputMedium) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                    {3, 13, 17, 128});
 }

 TEST(BufferToImageTest, InputLarge) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                    {3, 64, 64, 256});
 }

 TEST(BufferToImageTest, Filter1x1Small) {
@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
   const unsigned char input_data[] = {
       0xCD, 0x3C, 0x33, 0x40,
   };
-  TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
-      kernels::ARGUMENT, {2}, input_data);
+  TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
+                                                            {2}, input_data);
 }

 }  // namespace test
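A note on the raw bytes in ArgStringHalfToHalfSmall above: read little-endian as IEEE 754 half-precision, 0xCD 0x3C is the bit pattern 0x3CCD (about 1.2) and 0x33 0x40 is 0x4033 (about 2.1). A standalone decoder showing the binary16 layout (illustration only; not part of the MACE test harness):

#include <cmath>
#include <cstdint>

// Decode one IEEE 754 binary16 value from its 16-bit pattern.
float HalfBitsToFloat(uint16_t h) {
  const bool negative = (h >> 15) != 0;
  const int exponent = (h >> 10) & 0x1F;
  const int fraction = h & 0x3FF;
  float value;
  if (exponent == 0) {          // zero or subnormal: fraction * 2^-24
    value = std::ldexp(static_cast<float>(fraction), -24);
  } else if (exponent == 31) {  // infinity or NaN
    value = fraction != 0 ? NAN : INFINITY;
  } else {                      // normal: (1 + fraction/1024) * 2^(exponent-15)
    value = std::ldexp(1.0f + fraction / 1024.0f, exponent - 15);
  }
  return negative ? -value : value;
}
// HalfBitsToFloat(0x3CCD) ~= 1.2002f; HalfBitsToFloat(0x4033) ~= 2.0996f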
...
@@ -29,23 +29,19 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
       "Input", {1, 1, 2, 8},
       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                  NHWC,
-                                                  "InputNCHW",
-                                                  NCHW);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
   // Construct graph
   OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
       .Input("InputNCHW")
       .Output("OutputNCHW")
       .AddIntArg("group", 4)
       .Finalize(net.NewOperatorDef());
   // Run
   net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                  NCHW,
-                                                  "Output",
-                                                  NHWC);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+                                                  NHWC);
   // Check
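For readers verifying the expected tensors: with C channels split into G groups, a channel shuffle (assuming the standard ShuffleNet-style reshape-transpose, which these tests appear to exercise) sends input channel g * (C/G) + k to output position k * G + g. A minimal NHWC reference sketch (illustration only, not the MACE kernel):

#include <cstddef>
#include <vector>

// Output channel c reads input channel (c % groups) * (channels/groups) + c/groups.
std::vector<float> ReferenceChannelShuffle(const std::vector<float> &input,
                                           size_t pixels, size_t channels,
                                           size_t groups) {
  const size_t channels_per_group = channels / groups;
  std::vector<float> output(input.size());
  for (size_t p = 0; p < pixels; ++p) {
    for (size_t c = 0; c < channels; ++c) {
      const size_t src = (c % groups) * channels_per_group + c / groups;
      output[p * channels + c] = input[p * channels + src];
    }
  }
  return output;
}

Under that mapping, the C8G4 case reads each pixel's eight channels in the order 0, 2, 4, 6, 1, 3, 5, 7.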
@@ -65,7 +61,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                         kernels::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
       .Input("InputImage")
@@ -78,7 +74,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
   // Transfer output
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
                                         kernels::BufferType::IN_OUT_CHANNEL);
   // Check
   auto expected = CreateTensor<float>(
...
This diff is collapsed.