Commit 2940ce5d authored by 吴承辉

Merge branch 'mace-runtime-status' into 'master'

Return mace status for allocate

See merge request !537
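
This merge request renames the status-propagation macro from MACE_FAILURE_RETURN to MACE_RETURN_IF_ERROR and makes buffer allocation report failure through a MaceStatus return value instead of being checked by hand at each call site. The macro definition itself is not part of this diff; reconstructed from the hand-written check it replaces in Workspace::LoadModelTensor below, it is presumably equivalent to the following sketch (an illustration, not the actual header definition):

    // Hedged reconstruction -- the real definition lives in a MACE header
    // that this merge request does not modify. Evaluates the expression
    // once and early-returns any non-success status to the caller.
    #define MACE_RETURN_IF_ERROR(stmt)              \
      do {                                          \
        MaceStatus status = (stmt);                 \
        if (status != MaceStatus::MACE_SUCCESS) {   \
          return status;                            \
        }                                           \
      } while (0)

    // Usage, as applied throughout the hunks below: the four-line
    // allocate-and-check pattern collapses into a single statement.
    MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
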
......@@ -155,13 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
}
} else {
#endif
MACE_FAILURE_RETURN(ws_->LoadModelTensor(
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
*net_def, device_type_, model_data));
// Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
NetMode::INIT);
MACE_FAILURE_RETURN(net->Run());
MACE_RETURN_IF_ERROR(net->Run());
net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
#ifdef MACE_ENABLE_HEXAGON
}
......@@ -195,7 +195,7 @@ MaceStatus MaceEngine::Impl::Run(
" please use 1 to fill missing dimensions");
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first));
input_tensor->Resize(input.second.shape());
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
......@@ -221,7 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
hexagon_controller_->ExecuteGraph(*input_tensors[0], output_tensors[0]);
} else {
#endif
MACE_FAILURE_RETURN(net_->Run(run_metadata));
MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
#ifdef MACE_ENABLE_HEXAGON
}
#endif
......
......@@ -71,7 +71,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
CallStats call_stats;
if (future_wait) {
StatsFuture future;
MACE_FAILURE_RETURN(op->Run(&future));
MACE_RETURN_IF_ERROR(op->Run(&future));
if (run_metadata != nullptr) {
future.wait_fn(&call_stats);
} else {
......@@ -79,10 +79,10 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
}
} else if (run_metadata != nullptr) {
call_stats.start_micros = NowMicros();
MACE_FAILURE_RETURN(op->Run(nullptr));
MACE_RETURN_IF_ERROR(op->Run(nullptr));
call_stats.end_micros = NowMicros();
} else {
MACE_FAILURE_RETURN(op->Run(nullptr));
MACE_RETURN_IF_ERROR(op->Run(nullptr));
}
if (run_metadata != nullptr) {
......
......@@ -83,10 +83,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} else {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type)));
MaceStatus status = tensor_buffer_->Allocate(model_data_size);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
0, model_data_size);
......@@ -156,11 +153,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
if (mem_block.mem_id() >= 20000) {
std::unique_ptr<BufferBase> image_buf(
new Image());
MaceStatus status = image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
}
......@@ -168,12 +162,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
if (mem_block.mem_id() < 20000) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type)));
MaceStatus status = tensor_buf->Allocate(
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
......
......@@ -40,7 +40,7 @@ struct AddNFunctor {
Tensor *output_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensors[0]));
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
index_t size = output_tensor->size();
Tensor::MappingGuard output_map(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
......
......@@ -21,73 +21,73 @@ namespace mace {
namespace kernels {
void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output);
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output);
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
// calculate one output channel and one input channel
inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
......@@ -99,13 +99,13 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
const index_t out_width,
float *out_ptr,
const int stride) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
out_ptr[h * out_width + w]
+= in_ptr[(h * stride + i) * in_width + (w * stride + j)]
* filter_ptr[i * filter_width + j];
out_ptr[h * out_width + w] +=
in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
filter_ptr[i * filter_width + j];
}
}
}
......
......@@ -38,16 +38,15 @@ inline void Conv2dCPUK15x1Calc(const float *in_ptr,
for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
for (int i = 0; i < 15; ++i) {
for (int j = 0; j < 1; ++j) {
out_ptr[io * out_image_size + ih * out_width + w + iw]
+= in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)]
* filter_ptr[io * in_channels * 15 + i * 1 + j];
out_ptr[io * out_image_size + ih * out_width + w + iw] +=
in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 1 + j];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
......@@ -59,7 +58,7 @@ void Conv2dNeonK15x1S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < out_shape[0]; ++b) {
......@@ -69,8 +68,7 @@ void Conv2dNeonK15x1S1(const float *input,
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
......@@ -147,16 +145,16 @@ void Conv2dNeonK15x1S1(const float *input,
out_ptr_base[out_offset + 2 * out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3];
} // wt
} // h
} // h
#else
Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, out_width, w, tile_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // w
} // m
} // b
} // w
} // m
} // b
}
} // namespace kernels
......
......@@ -31,12 +31,8 @@ void Conv2dNeonK1x1S1(const float *input,
const index_t out_channels,
float *output) {
for (index_t b = 0; b < batch; ++b) {
Gemm(filter,
input + b * in_channels * height * width,
1,
out_channels,
in_channels,
height * width,
Gemm(filter, input + b * in_channels * height * width, 1, out_channels,
in_channels, height * width,
output + b * out_channels * height * width);
}
}
......
......@@ -17,8 +17,8 @@
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -39,16 +39,15 @@ inline void Conv2dCPUK1x15Calc(const float *in_ptr,
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 1; ++i) {
for (int j = 0; j < 15; ++j) {
out_ptr[io * out_image_size + (h + ih) * out_width + iw]
+= in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)]
* filter_ptr[io * in_channels * 15 + i * 15 + j];
out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 15 + j];
}
}
}
}
}
// Ho = 1, Wo = 4, Co = 1
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
......@@ -70,8 +69,7 @@ void Conv2dNeonK1x15S1(const float *input,
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
......@@ -133,16 +131,16 @@ void Conv2dNeonK1x15S1(const float *input,
vst1q_f32(out_ptr_base + out_offset, vo);
} // w
} // ht
} // ht
#else
Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, h, tile_height, out_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // h
} // m
} // b
} // h
} // m
} // b
}
} // namespace kernels
......
......@@ -41,8 +41,7 @@ void Conv2dNeonK1x7S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
......@@ -56,12 +55,9 @@ void Conv2dNeonK1x7S1(const float *input,
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 7 + c * 7;
const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
float32x4_t vf10, vf11;
......@@ -174,7 +170,7 @@ void Conv2dNeonK1x7S1(const float *input,
vst1q_f32(out_ptr2_base + out_offset, vo2);
vst1q_f32(out_ptr3_base + out_offset, vo3);
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
......@@ -239,17 +235,16 @@ void Conv2dNeonK1x7S1(const float *input,
vst1q_f32(out_ptr0_base + out_offset, vo0);
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 1, 7, out_height, out_width,
out_ptr0_base, 1);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
}
} // if
} // m
} // b
} // m
} // b
}
} // namespace kernels
......
......@@ -45,7 +45,7 @@ void Conv2dNeonK3x3S1(const float *input,
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output + b * out_batch_size + (m + 1) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
......@@ -54,11 +54,11 @@ void Conv2dNeonK3x3S1(const float *input,
#if defined(MACE_ENABLE_NEON)
float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
input + b * in_batch_size + c * in_image_size + 3 * in_width;
const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
#endif
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
......@@ -75,7 +75,6 @@ void Conv2dNeonK3x3S1(const float *input,
vf11 = vld1q_f32(filter_ptr1 + 3);
vf12 = vld1q_f32(filter_ptr1 + 6);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide
......@@ -179,7 +178,7 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
} // h
#elif defined(MACE_ENABLE_NEON) // arm v7
float *out_ptr0 = out_ptr0_base;
......@@ -198,7 +197,6 @@ void Conv2dNeonK3x3S1(const float *input,
vf167 = vld1_f32(filter_ptr1 + 6);
vf189 = vld1_f32(filter_ptr1 + 8);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide
......@@ -313,18 +311,18 @@ void Conv2dNeonK3x3S1(const float *input,
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float
*out_ptr0_base = output + b * out_batch_size + mm * out_image_size;
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float
*in_ptr0 = input + b * in_batch_size + c * in_image_size;
const float *in_ptr0 =
input + b * in_batch_size + c * in_image_size;
#if defined(MACE_ENABLE_NEON)
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
input + b * in_batch_size + c * in_image_size + 3 * in_width;
#endif
const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
......@@ -396,7 +394,6 @@ void Conv2dNeonK3x3S1(const float *input,
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
in_ptr0 += 4;
in_ptr1 += 4;
in_ptr2 += 4;
......@@ -411,7 +408,7 @@ void Conv2dNeonK3x3S1(const float *input,
in_ptr3 += 2 + in_width;
out_ptr0 += out_width;
} // h
} // h
#elif defined(MACE_ENABLE_NEON) // arm v7
float *out_ptr0 = out_ptr0_base;
......@@ -482,7 +479,6 @@ void Conv2dNeonK3x3S1(const float *input,
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
in_ptr0 += 4;
in_ptr1 += 4;
in_ptr2 += 4;
......@@ -499,15 +495,14 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0,
in_width, 3, 3, out_height, out_width,
out_ptr0_base, 1);
Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
} // mm
} // if
} // m
} // b
}
void Conv2dNeonK3x3S2(const float *input,
......@@ -529,8 +524,7 @@ void Conv2dNeonK3x3S2(const float *input,
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float
*filter_ptr = filter + m * in_channels * 9 + c * 9;
const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
......@@ -569,8 +563,8 @@ void Conv2dNeonK3x3S2(const float *input,
index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
......@@ -591,8 +585,8 @@ void Conv2dNeonK3x3S2(const float *input,
vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
} // w
} // h
#elif defined(MACE_ENABLE_NEON) // arm v7
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x2_t vf01, vf23, vf45, vf67, vf78;
......@@ -631,8 +625,8 @@ void Conv2dNeonK3x3S2(const float *input,
index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
......@@ -654,15 +648,14 @@ void Conv2dNeonK3x3S2(const float *input,
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_base, filter_ptr,
in_width, 3, 3, out_height, out_width,
out_base, 2);
Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
out_width, out_base, 2);
#endif
} // c
} // m
} // b
} // m
} // b
}
} // namespace kernels
......
......@@ -21,59 +21,59 @@
namespace mace {
namespace kernels {
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf10, vf20, vf30; \
float32x2_t vf01, vf11, vf21, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); \
\
/* outch 1 */ \
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 1); \
\
/* outch 2 */ \
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 1); \
\
/* outch 3 */ \
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf10, vf20, vf30; \
float32x2_t vf01, vf11, vf21, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); \
\
/* outch 1 */ \
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vf11, 1); \
\
/* outch 2 */ \
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vf21, 1); \
\
/* outch 3 */ \
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);
#define MACE_Conv2dNeonK5x5SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00; \
float32x2_t vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
#define MACE_Conv2dNeonK5x5SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00; \
float32x2_t vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
// Ho = 1, Wo = 4, Co = 4
......@@ -99,7 +99,7 @@ void Conv2dNeonK5x5S1(const float *input,
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
......@@ -118,7 +118,7 @@ void Conv2dNeonK5x5S1(const float *input,
filter + (m + 3) * in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
// input offset
index_t in_offset = h * in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3;
......@@ -157,7 +157,7 @@ void Conv2dNeonK5x5S1(const float *input,
filter_ptr2 -= 25;
filter_ptr3 -= 25;
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25,
......@@ -203,17 +203,16 @@ void Conv2dNeonK5x5S1(const float *input,
vst1q_f32(out_ptr0_base + out_offset, vo0);
filter_ptr0 -= 25;
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 5, 5, out_height, out_width,
out_ptr0_base, 1);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
} // mm
} // if
} // m
} // b
}
} // namespace kernels
......
......@@ -41,8 +41,7 @@ void Conv2dNeonK7x1S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
......@@ -56,12 +55,9 @@ void Conv2dNeonK7x1S1(const float *input,
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 7 + c * 7;
const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
/* load filter (4 outch x 4 height x 1 width) */
float32x4_t vf00, vf01;
float32x4_t vf10, vf11;
......@@ -98,7 +94,6 @@ void Conv2dNeonK7x1S1(const float *input,
out_ptr3_base[out_offset + 2 * out_width],
out_ptr3_base[out_offset + 3 * out_width]};
// input offset
index_t in_offset = h * in_width + w;
// input (3 slide)
......@@ -203,7 +198,7 @@ void Conv2dNeonK7x1S1(const float *input,
out_ptr3_base[out_offset + 2 * out_width] = vo3[2];
out_ptr3_base[out_offset + 3 * out_width] = vo3[3];
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
......@@ -280,17 +275,16 @@ void Conv2dNeonK7x1S1(const float *input,
out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 7, 1, out_height, out_width,
out_ptr0_base, 1);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
}
} // if
} // m
} // b
} // m
} // b
}
} // namespace kernels
......
......@@ -21,136 +21,136 @@
namespace mace {
namespace kernels {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); \
\
/* outch 1 */ \
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); \
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); \
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); \
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); \
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); \
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); \
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); \
\
/* outch 2 */ \
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); \
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); \
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); \
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); \
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); \
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); \
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); \
\
/* outch 3 */ \
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); \
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); \
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); \
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); \
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); \
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); \
\
/* outch 1 */ \
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); \
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); \
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); \
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); \
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); \
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); \
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); \
\
/* outch 2 */ \
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); \
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); \
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); \
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); \
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); \
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); \
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); \
\
/* outch 3 */ \
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); \
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); \
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); \
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); \
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); \
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); \
\
/* outch 1 */ \
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); \
vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); \
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); \
\
/* outch 2 */ \
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); \
vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); \
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); \
\
/* outch 3 */ \
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); \
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 + 3); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 + 3); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); \
\
/* outch 1 */ \
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); \
vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); \
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); \
\
/* outch 2 */ \
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); \
vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); \
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); \
\
/* outch 3 */ \
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); \
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 + 3); \
\
/* outch 0 */ \
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
// Ho = 1, Wo = 4, Co = 4
......@@ -176,7 +176,7 @@ void Conv2dNeonK7x7S1(const float *input,
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
......@@ -195,7 +195,7 @@ void Conv2dNeonK7x7S1(const float *input,
filter + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
// input offset
index_t in_offset = h * in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3;
......@@ -242,7 +242,7 @@ void Conv2dNeonK7x7S1(const float *input,
filter_ptr2 -= 49;
filter_ptr3 -= 49;
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
......@@ -296,17 +296,16 @@ void Conv2dNeonK7x7S1(const float *input,
vst1q_f32(out_ptr0_base + out_offset, vo0);
filter_ptr0 -= 49;
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 7, 7, out_height, out_width,
out_ptr0_base, 1);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
} // mm
} // if
} // m
} // b
}
// Ho = 1, Wo = 4, Co = 4
......@@ -372,8 +371,8 @@ void Conv2dNeonK7x7S2(const float *input,
vvi0 = vld2q_f32(in_ptr_base + in_offset);
// [8.10.12.14, 9.11.13.15]
vvi1 = vld2q_f32(in_ptr_base + in_offset + 8);
vi0 = vvi0.val[0]; // [0.2.4.6]
vi1 = vvi0.val[1]; // [1.3.5.7]
vi0 = vvi0.val[0]; // [0.2.4.6]
vi1 = vvi0.val[1]; // [1.3.5.7]
vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8]
vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9]
vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10]
......@@ -381,9 +380,9 @@ void Conv2dNeonK7x7S2(const float *input,
vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12]
#if defined(__aarch64__)
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
#else
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif
in_offset += in_width;
......@@ -403,7 +402,7 @@ void Conv2dNeonK7x7S2(const float *input,
filter_ptr2 -= 49;
filter_ptr3 -= 49;
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
......@@ -441,8 +440,8 @@ void Conv2dNeonK7x7S2(const float *input,
vvi0 = vld2q_f32(in_ptr_base + in_offset);
// [8.10.12.14, 9.11.13.15]
vvi1 = vld2q_f32(in_ptr_base + in_offset + 8);
vi0 = vvi0.val[0]; // [0.2.4.6]
vi1 = vvi0.val[1]; // [1.3.5.7]
vi0 = vvi0.val[0]; // [0.2.4.6]
vi1 = vvi0.val[1]; // [1.3.5.7]
vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8]
vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9]
vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10]
......@@ -462,17 +461,16 @@ void Conv2dNeonK7x7S2(const float *input,
vst1q_f32(out_ptr0_base + out_offset, vo0);
filter_ptr0 -= 49;
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 7, 7, out_height, out_width,
out_ptr0_base, 2);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 2);
#endif
} // c
} // mm
} // if
} // m
} // b
} // mm
} // if
} // m
} // b
}
// Ho = 1, Wo = 4, Co = 4
......@@ -538,18 +536,18 @@ void Conv2dNeonK7x7S3(const float *input,
vvi0 = vld3q_f32(in_ptr_base + in_offset);
// [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx]
vvi1 = vld3q_f32(in_ptr_base + in_offset + 12);
vi0 = vvi0.val[0]; // [0.3.6.9]
vi1 = vvi0.val[1]; // [1.4.7.10]
vi2 = vvi0.val[2]; // [2.5.8.11]
vi0 = vvi0.val[0]; // [0.3.6.9]
vi1 = vvi0.val[1]; // [1.4.7.10]
vi2 = vvi0.val[2]; // [2.5.8.11]
vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12]
vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13]
vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14]
vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15]
#if defined(__aarch64__)
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
#else
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif
in_offset += in_width;
......@@ -569,7 +567,7 @@ void Conv2dNeonK7x7S3(const float *input,
filter_ptr2 -= 49;
filter_ptr3 -= 49;
} // w
} // h
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
......@@ -607,9 +605,9 @@ void Conv2dNeonK7x7S3(const float *input,
vvi0 = vld3q_f32(in_ptr_base + in_offset);
// [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx]
vvi1 = vld3q_f32(in_ptr_base + in_offset + 12);
vi0 = vvi0.val[0]; // [0.3.6.9]
vi1 = vvi0.val[1]; // [1.4.7.10]
vi2 = vvi0.val[2]; // [2.5.8.11]
vi0 = vvi0.val[0]; // [0.3.6.9]
vi1 = vvi0.val[1]; // [1.4.7.10]
vi2 = vvi0.val[2]; // [2.5.8.11]
vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12]
vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13]
vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14]
......@@ -628,17 +626,16 @@ void Conv2dNeonK7x7S3(const float *input,
vst1q_f32(out_ptr0_base + out_offset, vo0);
filter_ptr0 -= 49;
} // w
} // h
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0,
in_width, 7, 7, out_height, out_width,
out_ptr0_base, 3);
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 3);
#endif
} // c
} // mm
} // if
} // m
} // b
} // mm
} // if
} // m
} // b
}
} // namespace kernels
......
......@@ -17,8 +17,8 @@
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -44,14 +44,13 @@ void TransformInput4x4(const float *input,
for (index_t h = 0; h < in_height - 2; h += 2) {
for (index_t w = 0; w < in_width - 2; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15;
d15;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
s15;
// load tile data
const float *input_ptr =
input + n * input_batch_size + c * in_height_width + h * in_width
+ w;
const float *input_ptr = input + n * input_batch_size +
c * in_height_width + h * in_width + w;
d0 = input_ptr[0];
d1 = input_ptr[1];
d2 = input_ptr[2];
......@@ -92,7 +91,7 @@ void TransformInput4x4(const float *input,
// store output
float *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index;
output + n * output_batch_size + c * tile_count + tile_index;
output_ptr[0] = s0;
output_ptr[1 * stride] = s1;
output_ptr[2 * stride] = s2;
......@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input,
float s[8][8];
for (index_t h = 0; h < in_height - 2; h += 6) {
for (index_t w = 0; w < in_width - 2; w += 6) {
const float *input_ptr =
input + n * input_batch_size + c * in_height_width + h * in_width
+ w;
const float *input_ptr = input + n * input_batch_size +
c * in_height_width + h * in_width + w;
for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
......@@ -203,7 +201,7 @@ void TransformInput8x8(const float *input,
}
float *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index;
output + n * output_batch_size + c * tile_count + tile_index;
for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
d0 = s[0][i];
......@@ -258,27 +256,18 @@ void BatchGemm(const float *input,
const index_t out_stride = out_channels * tile_count;
if (batch == 1) {
Gemm(filter,
input,
in_tile_area,
out_channels,
in_channels,
tile_count,
Gemm(filter, input, in_tile_area, out_channels, in_channels, tile_count,
output);
} else {
#pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) {
for (int i = 0; i < in_tile_area; ++i) {
const float
*in_ptr = input + b * in_batch_size + i * in_stride;
const float *in_ptr = input + b * in_batch_size + i * in_stride;
const float *filter_ptr = filter + i * filter_stride;
float *out_ptr = output + b * out_batch_size + i * out_stride;
Gemm(filter_ptr,
in_ptr,
1,
out_channels, /* rows */
in_channels, /* K */
tile_count, /* cols */
Gemm(filter_ptr, in_ptr, 1, out_channels, /* rows */
in_channels, /* K */
tile_count, /* cols */
out_ptr);
}
}
......@@ -305,12 +294,12 @@ void TransformOutput4x4(const float *input,
for (index_t h = 0; h < out_height; h += 2) {
for (index_t w = 0; w < out_width; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15;
d15;
float s0, s1, s2, s3, s4, s5, s6, s7;
float v0, v1, v2, v3;
const float *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset;
input + n * input_batch_size + m * tile_count + tile_offset;
d0 = input_ptr[0];
d1 = input_ptr[1 * stride];
d2 = input_ptr[2 * stride];
......@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input,
v2 = s2 - s4 - s6;
v3 = s3 - s5 - s7;
float *output_ptr =
output + n * output_batch_size + m * out_image_size + h * out_width
+ w;
float *output_ptr = output + n * output_batch_size +
m * out_image_size + h * out_width + w;
output_ptr[0] = v0;
output_ptr[1] = v1;
output_ptr[out_width] = v2;
......@@ -403,7 +391,7 @@ void TransformOutput8x8(const float *input,
for (index_t h = 0; h < out_height; h += 6) {
for (index_t w = 0; w < out_width; w += 6) {
const float *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset;
input + n * input_batch_size + m * tile_count + tile_offset;
for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
......@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input,
input_ptr += 8 * stride;
}
float *output_ptr =
output + n * output_batch_size + m * out_image_size + h * out_width
+ w;
float *output_ptr = output + n * output_batch_size +
m * out_image_size + h * out_width + w;
for (int i = 0; i < 6; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
......@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input,
}
} // namespace
// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
......@@ -485,7 +471,7 @@ void TransformFilter4x4(const float *filter,
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
......@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter,
float *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
const float G[8][3] = {{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}};
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
......@@ -612,7 +596,7 @@ void TransformFilter8x8(const float *filter,
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
}
}
}
......@@ -633,62 +617,38 @@ void WinoGradConv3x3s1(const float *input,
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t tile_height_count =
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
index_t tile_width_count =
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
index_t tile_count = tile_height_count * tile_width_count;
switch (out_tile_size) {
case 2:
TransformInput4x4(input,
batch,
in_height,
in_width,
in_channels,
tile_count,
transformed_input);
TransformInput4x4(input, batch, in_height, in_width, in_channels,
tile_count, transformed_input);
break;
case 6:
TransformInput8x8(input,
batch,
in_height,
in_width,
in_channels,
tile_count,
transformed_input);
TransformInput8x8(input, batch, in_height, in_width, in_channels,
tile_count, transformed_input);
break;
default:MACE_NOT_IMPLEMENTED;
default:
MACE_NOT_IMPLEMENTED;
}
BatchGemm(transformed_input,
transformed_filter,
batch,
in_channels,
out_channels,
tile_count,
out_tile_size,
transformed_output);
BatchGemm(transformed_input, transformed_filter, batch, in_channels,
out_channels, tile_count, out_tile_size, transformed_output);
switch (out_tile_size) {
case 2:
TransformOutput4x4(transformed_output,
batch,
out_height,
out_width,
out_channels,
tile_count,
output);
TransformOutput4x4(transformed_output, batch, out_height, out_width,
out_channels, tile_count, output);
break;
case 6:
TransformOutput8x8(transformed_output,
batch,
out_height,
out_width,
out_channels,
tile_count,
output);
TransformOutput8x8(transformed_output, batch, out_height, out_width,
out_channels, tile_count, output);
break;
default:MACE_NOT_IMPLEMENTED;
default:
MACE_NOT_IMPLEMENTED;
}
}
......@@ -704,52 +664,39 @@ void WinoGradConv3x3s1(const float *input,
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t tile_height_count =
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
index_t tile_width_count =
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2);
index_t transformed_input_size =
in_tile_area * batch * in_channels * tile_count;
in_tile_area * batch * in_channels * tile_count;
index_t transformed_filter_size = in_tile_area * out_channels * in_channels;
index_t
transformed_output_size = in_tile_area * batch * out_channels * tile_count;
index_t transformed_output_size =
in_tile_area * batch * out_channels * tile_count;
float *transformed_input = new float[transformed_input_size]; // TNCB
float *transformed_input = new float[transformed_input_size]; // TNCB
float *transformed_filter = new float[transformed_filter_size]; // TOC
float *transformed_output = new float[transformed_output_size];
switch (out_tile_size) {
case 2:
TransformFilter4x4(filter,
in_channels,
out_channels,
transformed_filter);
TransformFilter4x4(filter, in_channels, out_channels, transformed_filter);
break;
case 6:
TransformFilter8x8(filter,
in_channels,
out_channels,
transformed_filter);
TransformFilter8x8(filter, in_channels, out_channels, transformed_filter);
break;
default:MACE_NOT_IMPLEMENTED;
default:
MACE_NOT_IMPLEMENTED;
}
WinoGradConv3x3s1(input,
transformed_filter,
batch,
in_height,
in_width,
in_channels,
out_channels,
out_tile_size,
transformed_input,
transformed_output,
output);
delete[]transformed_input;
delete[]transformed_filter;
delete[]transformed_output;
WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width,
in_channels, out_channels, out_tile_size, transformed_input,
transformed_output, output);
delete[] transformed_input;
delete[] transformed_filter;
delete[] transformed_output;
}
void ConvRef3x3s1(const float *input,
......@@ -769,7 +716,7 @@ void ConvRef3x3s1(const float *input,
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
((b * out_channels + m) * out_height + h) * out_width + w;
output[out_offset] = 0;
for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < 3; ++kh) {
......@@ -777,11 +724,10 @@ void ConvRef3x3s1(const float *input,
index_t ih = h + kh;
index_t iw = w + kw;
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t
filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
output[out_offset] +=
input[in_offset] * filter[filter_offset];
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((m * in_channels) + c) * 3 + kh) * 3 + kw;
output[out_offset] += input[in_offset] * filter[filter_offset];
}
}
}
......
......@@ -13,13 +13,13 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include <algorithm>
#include <memory>
#include <random>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_winograd.h"
namespace mace {
namespace kernels {
......@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) {
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(input_data, input_data + input_size,
[&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
std::generate(filter_data, filter_data + filter_size,
[&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
std::generate(input_data, input_data + input_size, [&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
kernels::ConvRef3x3s1(input_data,
filter_data,
batch,
in_height,
in_width,
in_channels,
out_channels,
output_data_ref);
kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
in_channels, out_channels, output_data_ref);
kernels::WinoGradConv3x3s1(input_data,
filter_data,
batch,
in_height,
in_width,
in_channels,
out_channels,
6,
kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
in_width, in_channels, out_channels, 6,
output_data);
// test
......
......@@ -32,15 +32,15 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
float *output);
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
} // namespace kernels
} // namespace mace
......
......@@ -16,8 +16,8 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/core/macros.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
namespace mace {
namespace kernels {
......@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
}
}
......@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
// left
for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
}
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
......@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// right
for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
}
} // h
#else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_shape[3]; ++iw) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
ih,
iw,
ih - pad_top,
iw - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih - pad_top,
iw - pad_left, out_width, in_height, in_width, 3,
3, out_base);
}
}
#endif
......@@ -275,29 +221,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// bottom
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top,
w - pad_left, out_width, in_height, in_width, 3,
3, out_base);
}
}
} // m
} // b
} // b
}
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
w * 2 - pad_left, out_width, in_height, in_width,
3, 3, out_base);
}
}
......@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
for (h = valid_h_start; h < valid_h_stop; ++h) {
// left
for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
w * 2 - pad_left, out_width, in_height, in_width,
3, 3, out_base);
}
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
......@@ -397,8 +316,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
......@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// right
for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
w * 2 - pad_left, out_width, in_height, in_width,
3, 3, out_base);
}
} // h
#else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
ih,
iw,
ih * 2 - pad_top,
iw * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih * 2 - pad_top,
iw * 2 - pad_left, out_width, in_height,
in_width, 3, 3, out_base);
}
}
#endif
......@@ -471,22 +372,13 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// bottom
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top,
w * 2 - pad_left, out_width, in_height, in_width,
3, 3, out_base);
}
}
} // m
} // b
} // b
}
} // namespace kernels
......
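The top/left/right/bottom border loops above fall back to a scalar per-pixel helper while the NEON path covers the interior rows. The body of DepthwiseConv2dPixel is outside this diff; a hedged sketch of what such a helper plausibly computes, with parameter names inferred from the call sites:

#include <cstdint>

typedef int64_t index_t;  // stand-in for mace::index_t

// Accumulate one output pixel; taps that fall outside the input act as
// zero padding. The real implementation may differ in detail.
void DepthwiseConv2dPixel(const float *in_base, const float *filter,
                          const index_t out_h, const index_t out_w,
                          const index_t in_h_start, const index_t in_w_start,
                          const index_t out_width, const index_t in_height,
                          const index_t in_width, const index_t filter_height,
                          const index_t filter_width, float *out_base) {
  float sum = 0.0f;
  for (index_t i = 0; i < filter_height; ++i) {
    for (index_t j = 0; j < filter_width; ++j) {
      const index_t in_h = in_h_start + i;
      const index_t in_w = in_w_start + j;
      if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
        sum += in_base[in_h * in_width + in_w] * filter[i * filter_width + j];
      }
    }
  }
  out_base[out_h * out_width + out_w] = sum;
}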
......@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor {
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
......
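From here on, most hunks swap MACE_FAILURE_RETURN for MACE_RETURN_IF_ERROR and wrap previously unchecked Allocate/Resize calls. A minimal sketch of what such an early-return macro looks like; the real definition lives in MACE's headers and may differ in detail:

#define MACE_RETURN_IF_ERROR(stmt)                          \
  {                                                         \
    MaceStatus status = (stmt);                             \
    if (status != MaceStatus::MACE_SUCCESS) return status;  \
  }

Wrapping a call site this way turns a silently dropped failure into an immediate status return, which is the point of the rename.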
......@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
outer_sizes[i] = input->size() / inner_size;
output_shape[axis_] += input->dim(axis_);
}
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_ptr = output->mutable_data<T>();
......
......@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
RoundType::FLOOR,
output_shape.data());
}
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
index_t batch = output->dim(0);
index_t channels = output->dim(1);
......@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (is_filter_transformed_) {
transformed_filter_ptr = filter_data;
} else {
MACE_FAILURE_RETURN(transformed_filter_.Resize(
MACE_RETURN_IF_ERROR(transformed_filter_.Resize(
transformed_filter_shape));
switch (winograd_out_tile_size) {
case 2:
......@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height
|| extra_input_width != input_width) {
MACE_FAILURE_RETURN(ConstructNCHWInputWithSpecificPadding(input,
MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
pad_left,
......
......@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
......@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const int pad_width = pad_left + pad_right;
std::vector<index_t> output_shape(
{batch, channels, height + pad_height, width + pad_width});
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
output_tensor->Clear();
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
......@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
const int padded_top = paddings[0] / 2;
const int padded_left = paddings[1] / 2;
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
......
......@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_, padding_type_,
output_shape.data(),
paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
} else {
output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0);
......@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_,
output_shape_.data(),
paddings_.data(), true);
MACE_FAILURE_RETURN(output->Resize(output_shape_));
MACE_RETURN_IF_ERROR(output->Resize(output_shape_));
}
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
......
......@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor {
std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width};
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
......
......@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
RoundType::FLOOR,
output_shape.data());
}
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
output->Clear();
index_t batch = output->dim(0);
......
......@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
}
}
}
MACE_FAILURE_RETURN(output->ResizeLike(input0));
MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard output_guard(output);
......
......@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t N = output->dim(0);
const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
const index_t output_size = weight->dim(0);
......
......@@ -44,7 +44,7 @@ struct MatMulFunctor {
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
MACE_FAILURE_RETURN(C->Resize(c_shape));
MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
......
......@@ -21,12 +21,12 @@
namespace mace {
namespace kernels {
template<typename T>
MaceStatus ActivationFunctor<DeviceType::GPU,
T>::operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
switch (activation_) {
case RELU:tuning_key_prefix_ = "relu_opencl_kernel";
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:tuning_key_prefix_ = "relux_opencl_kernel";
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:tuning_key_prefix_ = "prelu_opencl_kernel";
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:tuning_key_prefix_ = "tanh_opencl_kernel";
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:tuning_key_prefix_ = "sigmoid_opencl_kernel";
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:LOG(FATAL) << "Unknown activation type: " << activation_;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
......@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
return MACE_SUCCESS;
}
template
struct ActivationFunctor<DeviceType::GPU, float>;
template
struct ActivationFunctor<DeviceType::GPU, half>;
template struct ActivationFunctor<DeviceType::GPU, float>;
template struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
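The one-byte "kernel error" flag is the other recurring pattern in these GPU hunks: when out-of-range checking is enabled, each functor allocates a single byte on the device, zeroes it from the host, hands it to the kernel as its first argument, and verifies it after the run. The substantive change in this patch is that the allocation's status is no longer dropped. Stitched together from the hunks themselves (not a drop-in function):

// Before launching: build with -DOUT_OF_RANGE_CHECK and clear the flag.
if (runtime->IsOutOfRangeCheckEnabled()) {
  built_options.emplace("-DOUT_OF_RANGE_CHECK");
  kernel_error_ = std::move(std::unique_ptr<Buffer>(
      new Buffer(GetDeviceAllocator(DeviceType::GPU))));
  // Previously this 1-byte allocation was unchecked; the patch wraps it
  // so an allocation failure propagates as a MaceStatus.
  MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
  kernel_error_->Map(nullptr);
  *(kernel_error_->mutable_data<char>()) = 0;
  kernel_error_->UnMap();
}

// After the run: map the flag back and fail loudly if the kernel set it.
if (runtime->IsOutOfRangeCheckEnabled()) {
  kernel_error_->Map(nullptr);
  char *kerror_code = kernel_error_->mutable_data<char>();
  MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
  kernel_error_->UnMap();
}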
......@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -71,7 +71,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
......@@ -87,13 +87,13 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape,
output_image_shape));
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......
......@@ -23,14 +23,15 @@ namespace mace {
namespace kernels {
template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
......@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -96,7 +97,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......
......@@ -23,9 +23,9 @@ namespace kernels {
template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -67,7 +67,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0)
roundup_gws[i] = RoundUp(gws[i], lws[i]);
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
......
......@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
const BufferType type,
Tensor *image,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
MACE_FAILURE_RETURN(image->ResizeImage(new_shape, image_shape));
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else {
MACE_FAILURE_RETURN(image->ResizeImage(buffer->shape(), image_shape));
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
}
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
......@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -107,7 +106,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
......@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
if (type == CONV2D_FILTER) {
const index_t inner_size =
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
......@@ -16,18 +16,16 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_FAILURE_RETURN(output->ResizeLike(input));
const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
......@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be multiple of 4");
MACE_CHECK(groups_ % 4 == 0,
"groups must be multiple of 4");
MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
......@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
built_options);
kernel_ =
runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -76,7 +73,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
template
struct ChannelShuffleFunctor<DeviceType::GPU, float>;
template
struct ChannelShuffleFunctor<DeviceType::GPU, half>;
template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
template struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
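The trailing hunks in these files also collapse two-line explicit instantiations into one line; both spellings are equivalent C++. For readers unfamiliar with the construct, a minimal example (ExampleFunctor is hypothetical):

template <typename T>
struct ExampleFunctor {
  void operator()() {}
};

// Explicit instantiation: emit code for this specialization in the
// current translation unit. The kernels above do this for float and for
// the GPU half-precision type.
template struct ExampleFunctor<float>;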
......@@ -22,11 +22,9 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
......@@ -37,16 +35,15 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
static void Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
static MaceStatus Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -67,8 +64,8 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -95,7 +92,7 @@ static void Concat2(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -125,15 +122,17 @@ static void Concat2(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
static void ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
static MaceStatus ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -179,7 +178,7 @@ static void ConcatN(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->is_profiling_enabled()) {
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros,
call_stats.start_micros);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
......@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel,
}
};
}
return MACE_SUCCESS;
}
template <typename T>
......@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future, &kwg_size_, &kernel_error_);
break;
return Concat2(&kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&kwg_size_, &kernel_error_);
return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
future, &kwg_size_, &kernel_error_);
} else {
MACE_NOT_IMPLEMENTED;
}
......
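The Concat changes above illustrate the patch's second theme: helpers such as Concat2 and ConcatN flip their return type from void to MaceStatus, return MACE_SUCCESS on the success path, and the caller forwards the result with `return helper(...)` instead of calling and falling through. A compact sketch of that refactoring, using hypothetical stand-in types:

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };  // stand-in

#define MACE_RETURN_IF_ERROR(stmt)              \
  {                                             \
    MaceStatus status = (stmt);                 \
    if (status != MACE_SUCCESS) return status;  \
  }

struct FakeBuffer {  // stand-in for mace::BufferBase
  MaceStatus Allocate(long) { return MACE_SUCCESS; }
};

// was: static void Helper(FakeBuffer *error_buf)
static MaceStatus Helper(FakeBuffer *error_buf) {
  MACE_RETURN_IF_ERROR(error_buf->Allocate(1));  // was an unchecked call
  // ... set kernel args and enqueue ...
  return MACE_SUCCESS;  // was: fell off the end of a void function
}

// was: Helper(error_buf); break;  -- the status was lost
MaceStatus Run(FakeBuffer *error_buf) {
  return Helper(error_buf);  // the status now propagates to the caller
}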
......@@ -18,61 +18,61 @@
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
template <typename T>
MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
......@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
return conv2d_func(
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
return Conv2dOpencl(
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
}
return MACE_SUCCESS;
}
template struct Conv2dFunctor<DeviceType::GPU, float>;
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
......@@ -25,11 +25,9 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
......@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size / compute_units) * 8,
gws[2]);
(cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
}
......@@ -57,21 +54,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_UNUSED(padding);
MACE_UNUSED(dilations);
const index_t batch = output->dim(0);
......@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -145,7 +142,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
} // namespace kernels
......
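The LocalWS reflows in these conv2d files are formatting-only; the heuristic itself is unchanged. Condensed into one function (parameters generalized, since constants such as kernel_cache_size differ between the 1x1, 3x3, and general variants):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of the local-work-group heuristic: size the workgroup so each
// compute unit's working set roughly fits the global-memory cache.
// Assumes all gws dimensions are non-zero.
std::vector<uint32_t> LocalWSSketch(const uint32_t *gws,
                                    const uint32_t kwg_size,
                                    const uint64_t cache_size,
                                    const uint32_t compute_units,
                                    const uint32_t kernel_cache_size,
                                    const uint32_t base_cache_size) {
  std::vector<uint32_t> lws(4, 0);
  const uint32_t base = static_cast<uint32_t>(cache_size / base_cache_size);
  // Fill dimension 1 first, then budget dimension 0 so the product stays
  // within the device's max workgroup size.
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      static_cast<uint32_t>(
          (cache_size / kernel_cache_size / lws_size / compute_units) * 8),
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);  // fall back if it rounds to 0
  }
  return lws;
}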
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
......@@ -24,22 +24,20 @@ namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1);
const uint32_t base = std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize,
4);
const uint32_t base =
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
kwg_size / lws[1]);
lws[0] =
std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
RoundUp<uint32_t>(cache_size / kernel_cache_size /
lws_size / compute_units, base),
RoundUp<uint32_t>(
cache_size / kernel_cache_size / lws_size / compute_units, base),
gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
......@@ -50,21 +48,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -129,7 +127,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
} // namespace kernels
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
......@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kernel_size,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
......@@ -41,10 +40,10 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / kernel_size / lws_size / compute_units)
* 8,
gws[2]);
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
lws_size / compute_units) *
8,
gws[2]);
if (lws[2] == 0) {
if (gws[2] < lws_limit) {
lws[2] = gws[2];
......@@ -58,21 +57,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -137,7 +136,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -168,11 +167,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
std::string tuning_key =
Concat("conv2d_general_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3),
filter->dim(2), filter->dim(3));
Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
} // namespace kernels
......
......@@ -20,20 +20,20 @@ namespace kernels {
namespace {
void Deconv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *paddings,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *paddings,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
#define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride - 1) / stride;
const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride;
((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
const float stride_r = 1.f / static_cast<float>(stride);
const int padding_h = (paddings[0]+1) >> 1;
const int padding_w = (paddings[0]+1) >> 1;
const int padding_h = (paddings[0] + 1) >> 1;
const int padding_w = (paddings[0] + 1) >> 1;
const int align_h = stride - 1 - padding_h;
const int align_w = stride - 1 - padding_w;
......@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel,
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:break;
case RELU:built_options.emplace("-DUSE_RELU");
case NOOP:
break;
case RELUX:built_options.emplace("-DUSE_RELUX");
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case TANH:built_options.emplace("-DUSE_TANH");
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
case TANH:
built_options.emplace("-DUSE_TANH");
break;
default:LOG(FATAL) << "Unknown activation type: " << activation;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
......@@ -150,16 +156,19 @@ void Deconv2dOpencl(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
} // namespace
template <typename T>
MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (output_shape_.size() == 4) {
paddings_.clear();
paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize(
input->shape().data(),
filter->shape().data(),
strides_, padding_type_,
output_shape_.data(),
paddings_.data());
CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
strides_, padding_type_, output_shape_.data(),
paddings_.data());
} else {
output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_,
output_shape_.data(),
paddings_.data());
CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
strides_, output_shape_.data(), paddings_.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape_, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape_, output_image_shape));
Deconv2dOpencl(&kernel_, input, filter, bias,
strides_[0], paddings_.data(),
activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
return MACE_SUCCESS;
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_[0],
paddings_.data(), activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
}
template struct Deconv2dFunctor<DeviceType::GPU, float>;
......
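A quick worked example of the deconv geometry arithmetic cleaned up above, using hypothetical values width = 13, stride = 2, paddings[0] = 1 (with MACE_WIDTH_BLK = 5):

// n_strides rounds the output width up to whole strides:
//   n_strides    = (13 + 2 - 1) / 2       = 7
// width_blocks rounds that up to a multiple of the 5-wide block and
// scales back by the stride:
//   width_blocks = ((7 + 5 - 1) / 5) * 2  = 2 * 2 = 4
//   stride_r     = 1.0f / 2               = 0.5f
// and the symmetric padding/alignment terms (note that, as written, both
// padding_h and padding_w derive from paddings[0]):
//   padding_h = padding_w = (1 + 1) >> 1  = 1
//   align_h   = align_w   = 2 - 1 - 1     = 0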
......@@ -40,7 +40,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width * block_size_;
output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not supported:")
<< output_depth;
<< output_depth;
kernel_name = "depth_to_space";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
......@@ -53,7 +53,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
output_width = input_width / block_size_;
output_depth = input_depth * block_size_ * block_size_;
MACE_CHECK(input_depth % 4 == 0, "input channel not supported:")
<< input_depth;
<< input_depth;
kernel_name = "space_to_depth";
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
......@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global();
......@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name, built_options);
kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -107,7 +106,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......
......@@ -24,8 +24,7 @@ namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
......@@ -40,9 +39,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
}
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size) * 4,
gws[2]);
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
gws[2]);
if (lws[2] == 0) {
lws[2] = gws[2];
}
......@@ -52,21 +50,21 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
static void DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int stride,
const int *paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int stride,
const int *paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -149,7 +147,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
......@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel",
gws[0], gws[1], gws[2], multiplier);
std::string tuning_key =
Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
template <typename T>
......@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) {
......@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
return MACE_SUCCESS;
return DepthwiseConv2d(
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
}
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
......
......@@ -22,16 +22,15 @@ namespace kernels {
template <typename T>
MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size()
|| input0->dim_size() == 1
|| input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
std::swap(input0, input1);
......@@ -39,28 +38,26 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
}
if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(3) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast";
<< "Element-Wise op only support channel dimension broadcast";
} else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
input0->dim(3) == input1->dim(3) &&
input1->dim(1) == 1 &&
input1->dim(2) == 1)
<< "Element-Wise op only support channel dimension broadcast";
input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
input1->dim(2) == 1)
<< "Element-Wise op only support channel dimension broadcast";
}
}
}
std::vector<index_t > output_shape(4);
std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape,
BufferType::IN_OUT_CHANNEL,
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -115,7 +112,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
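The OUT_OF_RANGE_CHECK branches above all follow one protocol: allocate a one-byte buffer, zero it through a host mapping, hand it to the kernel as an error flag, and re-map it after the run to assert the flag is still zero. A host-side sketch of that round trip, with plain memory standing in for the mapped cl::Buffer (assumed semantics; MACE's Buffer/Map API is summarized, not reproduced):

#include <cstdio>

// Stand-in for a 1-byte device buffer the kernel sets to a nonzero
// error code when it detects an out-of-bounds access.
struct ErrorFlag {
  char code = 0;
  char *Map() { return &code; }   // real code: Buffer::Map(nullptr)
  void UnMap() {}                 // real code: Buffer::UnMap()
};

void FakeKernel(char *error_flag, bool out_of_range) {
  if (out_of_range) *error_flag = 1;  // kernel writes the flag on overflow
}

int main() {
  ErrorFlag flag;
  *flag.Map() = 0;          // zero the flag before launch
  flag.UnMap();
  FakeKernel(flag.Map(), /*out_of_range=*/false);
  flag.UnMap();
  char *code = flag.Map();  // re-map after the kernel to read the result
  std::printf("kernel error code: %d\n", *code);  // expect 0
  flag.UnMap();
  return 0;
}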
......@@ -20,18 +20,18 @@ namespace kernels {
namespace {
template <typename T>
void FCWXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MaceStatus FCWXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global();
......@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -115,7 +115,7 @@ void FCWXKernel(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
......@@ -170,21 +170,23 @@ void FCWXKernel(cl::Kernel *kernel,
}
};
}
return MACE_SUCCESS;
}
template <typename T>
void FCWTXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MaceStatus FCWTXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global();
......@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
*lws = {16, kwg_size/16, 0};
*lws = {16, kwg_size / 16, 0};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0);
......@@ -246,7 +248,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
......@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel,
}
std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS;
}
} // namespace
......@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return MACE_SUCCESS;
return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
}
template struct FullyConnectedFunctor<DeviceType::GPU, float>;
......
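FCWXKernel and FCWTXKernel change from void to MaceStatus so that allocation failures inside them (the kernel_error buffer) can surface; the operator() above then ends with `return FCWXKernel<T>(...)` instead of an unconditional `return MACE_SUCCESS;`. The shape of that refactor, reduced to a stub:

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

MaceStatus Helper() { return MACE_SUCCESS; }  // was: void Helper()

// Before the change, the helper's failures were silently dropped:
//   Helper();
//   return MACE_SUCCESS;
// After it, the helper's status is the operator's status:
MaceStatus Operator() { return Helper(); }

int main() { return Operator(); }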
......@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
kwg_size / lws[1]);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
return lws;
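Default3DLocalWS derives the local work-group size from the device's global memory cache size. A numeric trace of the heuristic with assumed values for the cache size and kBaseGPUMemCacheSize (neither constant appears in this diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t gws[3] = {4, 64, 256};
  const uint32_t kwg_size = 256;                   // max work-group size
  const uint64_t cache_size = 128 * 1024;          // assumed device cache
  const uint32_t base = cache_size / (16 * 1024);  // assumed base constant: 8
  uint32_t lws[3];
  lws[1] = std::min(gws[1], kwg_size);                           // 64
  lws[2] = std::min(std::min(gws[2], base), kwg_size / lws[1]);  // min(256,8,4)=4
  lws[0] = std::min(base, kwg_size / (lws[1] * lws[2]));         // min(8,1)=1
  std::printf("lws = {%u, %u, %u}\n", lws[0], lws[1], lws[2]);   // {1, 64, 4}
  return 0;
}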
......@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+3);
std::vector<uint32_t> internal_gws(gws, gws + 3);
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]);
......@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
if (timer == nullptr) {
uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2],
block_size);
const uint32_t num_blocks =
RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported()
&& (i == num_blocks - 1)) {
if (runtime->IsNonUniformWorkgroupsSupported() &&
(i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size));
}
error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported()
&& (i == num_blocks - 1)) {
if (runtime->IsNonUniformWorkgroupsSupported() &&
(i == num_blocks - 1)) {
gws2 = (internal_gws[2] - (i * block_size));
}
error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0},
{kwg_size / 4, 4, 0},
{kwg_size / 8, 8, 0},
{kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0},
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
{kwg_size / 2, 2, 0}, {kwg_size / 4, 4, 0},
{kwg_size / 8, 8, 0}, {kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0}, {kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0}, {kwg_size / 256, 256, 0},
{kwg_size, 1, 0}, {1, kwg_size, 0}};
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
......@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> internal_gws(gws, gws+2);
std::vector<uint32_t> internal_gws(gws, gws + 2);
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) {
internal_gws[i] = RoundUp(gws[i], params[i]);
......@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
if (timer == nullptr) {
uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1],
block_size);
const uint32_t num_blocks =
RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported()
&& (i == num_blocks - 1)) {
if (runtime->IsNonUniformWorkgroupsSupported() &&
(i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size));
}
error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = block_size;
if (runtime->IsNonUniformWorkgroupsSupported()
&& (i == num_blocks - 1)) {
if (runtime->IsNonUniformWorkgroupsSupported() &&
(i == num_blocks - 1)) {
gws1 = (internal_gws[1] - (i * block_size));
}
error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
} // namespace kernels
} // namespace mace
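The non-uniform-work-group fallbacks above pad each global size with RoundUp(gws[i], params[i]) and split the last dimension into RoundUpDiv(...) blocks. A sketch of those two helpers, consistent with how the call sites use them:

#include <cassert>
#include <cstdint>

// Smallest multiple of factor that is >= value.
template <typename T>
T RoundUp(T value, T factor) {
  return (value + factor - 1) / factor * factor;
}

// Ceiling division: number of factor-sized blocks covering value.
template <typename T>
T RoundUpDiv(T value, T factor) {
  return (value + factor - 1) / factor;
}

int main() {
  // Pad a global size of 30 to the local size 8, then count blocks.
  assert(RoundUp<uint32_t>(30, 8) == 32);
  assert(RoundUpDiv<uint32_t>(30, 8) == 4);
  return 0;
}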
......@@ -88,10 +88,9 @@ inline bool LimitKernelTime() {
}
template <typename T>
bool IsVecEqual(const std::vector<T> &input0,
const std::vector<T> &input1) {
bool IsVecEqual(const std::vector<T> &input0, const std::vector<T> &input1) {
return ((input0.size() == input1.size()) &&
(std::equal(input0.begin(), input0.end(), input1.begin())));
(std::equal(input0.begin(), input0.end(), input1.begin())));
}
template <typename T>
......
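IsVecEqual is the guard the functors use to skip re-setting kernel arguments when the input shape is unchanged between runs; only its formatting changes here. A usage sketch of that shape-caching pattern:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
bool IsVecEqual(const std::vector<T> &a, const std::vector<T> &b) {
  return a.size() == b.size() && std::equal(a.begin(), a.end(), b.begin());
}

int main() {
  std::vector<int64_t> prev_shape;             // empty on the first run
  std::vector<int64_t> shape = {1, 32, 32, 4};
  if (!IsVecEqual(prev_shape, shape)) {        // first run: shapes differ
    std::puts("setting kernel args");
    prev_shape = shape;                        // cache for the next run
  }
  if (IsVecEqual(prev_shape, shape)) std::puts("args reused");
  return 0;
}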
......@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
const BufferType type,
Tensor *buffer,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape);
MACE_FAILURE_RETURN(buffer->Resize(image->shape()));
MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
......@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -100,7 +99,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
......@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
}
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) {
const index_t inner_size =
buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
......
......@@ -22,14 +22,14 @@ namespace kernels {
template <typename T>
MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
const Tensor *B,
Tensor *C,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_FAILURE_RETURN(C->ResizeImage(c_shape, c_image_shape));
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0);
const index_t height = C->dim(1);
......@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -71,7 +71,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key =
Concat("matmul_opencl_kernel", C->dim(0),
C->dim(1), C->dim(2), C->dim(3));
std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
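The tuning keys above are built by Concat from a kernel name plus the four output dimensions. Concat's definition is not in this diff; a variadic sketch that joins streamable arguments with an assumed '_' separator (C++17):

#include <iostream>
#include <sstream>
#include <string>

// Join any streamable arguments into one underscore-separated key.
template <typename... Args>
std::string Concat(Args... args) {
  std::ostringstream oss;
  bool first = true;
  // Fold over the pack, prefixing every argument after the first with '_'.
  (((oss << (first ? "" : "_") << args), first = false), ...);
  return oss.str();
}

int main() {
  std::cout << Concat("matmul_opencl_kernel", 1, 64, 64, 1) << "\n";
  // prints: matmul_opencl_kernel_1_64_64_1
  return 0;
}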
......@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap();
......@@ -70,7 +70,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer())));
*(static_cast<cl::Buffer *>(kernel_error->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
......@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
bool is_out_of_range = false;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error->Map(nullptr);
is_out_of_range =
*(kernel_error->mutable_data<char>()) == 1 ? true : false;
is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
kernel_error->UnMap();
}
return is_out_of_range;
......@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class OutOfRangeCheckTest : public ::testing::Test {
protected:
virtual void SetUp() {
setenv("OUT_OF_RANGE_CHECK", "1", 1);
}
virtual void SetUp() { setenv("OUT_OF_RANGE_CHECK", "1", 1); }
};
TEST(OutOfRangeCheckTest, RandomTest) {
......@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer",
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
Tensor *buffer =
ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape);
std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image",
GetDeviceAllocator(DeviceType::GPU),
Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape);
......
......@@ -20,26 +20,25 @@
namespace mace {
namespace kernels {
template<typename T>
MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(
this->paddings_.size() == static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0)
&& (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
template <typename T>
MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
auto input_shape = input->shape();
std::vector<index_t>
output_shape = {input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("pad", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
template
struct PadFunctor<DeviceType::GPU, float>;
template
struct PadFunctor<DeviceType::GPU, half>;
template struct PadFunctor<DeviceType::GPU, float>;
template struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
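Nearly every functor here sizes its output image via CalImage2DShape(..., IN_OUT_CHANNEL, ...) before ResizeImage. Assuming the IN_OUT_CHANNEL layout packs four channels per texel — so the image is ceil(C/4)*W texels wide and N*H texels tall — the computation looks like this (the real CalImage2DShape may differ):

#include <cstdint>
#include <cstdio>
#include <vector>

using index_t = int64_t;

// Assumed IN_OUT_CHANNEL packing: 4 channels per RGBA texel.
std::vector<size_t> InOutChannelImageShape(const std::vector<index_t> &nhwc) {
  const index_t channel_blocks = (nhwc[3] + 3) / 4;  // ceil(C/4)
  return {static_cast<size_t>(channel_blocks * nhwc[2]),   // width
          static_cast<size_t>(nhwc[0] * nhwc[1])};         // height
}

int main() {
  auto image = InOutChannelImageShape({1, 8, 6, 5});  // N=1 H=8 W=6 C=5
  std::printf("image: %zu x %zu\n", image[0], image[1]);  // 12 x 8
  return 0;
}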
......@@ -23,15 +23,13 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
kwg_size / lws[1]);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
......@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T>
MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
......@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
......@@ -125,7 +123,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -23,11 +23,9 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
......@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
......@@ -110,7 +108,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -20,7 +20,7 @@
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const std::vector<Tensor *> &output_list,
......@@ -29,14 +29,15 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0)
<< "output channels of slice op must be divisible by 4";
std::vector<index_t> output_shape({input->dim(0), input->dim(1),
input->dim(2), output_channels});
<< "output channels of slice op must be divisible by 4";
std::vector<index_t> output_shape(
{input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->ResizeImage(output_shape, image_shape));
for (size_t i = 0; i < outputs_count; ++i) {
MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = OpenCLRuntime::Global();
......@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
......@@ -80,7 +80,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->is_profiling_enabled()) {
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros,
call_stats.start_micros);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
......@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
template
struct SliceFunctor<DeviceType::GPU, float>;
template
struct SliceFunctor<DeviceType::GPU, half>;
template struct SliceFunctor<DeviceType::GPU, float>;
template struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
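The profiling branch above merges per-output enqueue timings into one CallStats: the earliest start wins and end_micros accumulates the summed kernel durations. The merge step in isolation:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

struct CallStats { int64_t start_micros, end_micros; };

// Merge per-enqueue timings the way the slice profiling above does:
// keep the earliest start and accumulate the summed kernel durations.
void Accumulate(CallStats *total, const CallStats &one) {
  total->start_micros = std::min(one.start_micros, total->start_micros);
  total->end_micros += one.end_micros - one.start_micros;
}

int main() {
  CallStats total{INT64_MAX, 0};
  for (const CallStats &s : {CallStats{100, 140}, CallStats{150, 180}}) {
    Accumulate(&total, s);
  }
  std::printf("start=%lld total_us=%lld\n",
              static_cast<long long>(total.start_micros),
              static_cast<long long>(total.end_micros));  // start=100 total_us=70
  return 0;
}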
......@@ -24,10 +24,8 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
......@@ -45,8 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
template <typename T>
MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output,
StatsFuture *future) {
Tensor *output,
StatsFuture *future) {
const index_t batch = logits->dim(0);
const index_t height = logits->dim(1);
const index_t width = logits->dim(2);
......@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -88,7 +86,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("softmax_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -26,17 +26,13 @@ namespace kernels {
template <typename T>
MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0);
if (b2s_) {
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NHWC,
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data());
} else {
CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NHWC,
CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
output_shape.data());
}
......@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
if (b2s_) {
MACE_FAILURE_RETURN(space_tensor->ResizeImage(output_shape,
output_image_shape));
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "batch_to_space";
} else {
MACE_FAILURE_RETURN(batch_tensor->ResizeImage(output_shape,
output_image_shape));
MACE_RETURN_IF_ERROR(
batch_tensor->ResizeImage(output_shape, output_image_shape));
kernel_name = "space_to_batch";
}
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
......@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name, built_options);
kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -92,7 +87,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......
......@@ -24,7 +24,6 @@ namespace kernels {
template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
......@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -78,12 +77,12 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape));
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
Concat("winograd_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3));
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
......@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -188,14 +185,14 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape));
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
......
......@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase {
MACE_CHECK(
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape();
MACE_FAILURE_RETURN(output->Resize({input_shape[0] + this->paddings_[0]
MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0]
+ this->paddings_[1],
input_shape[1] + this->paddings_[2]
+ this->paddings_[3],
......
......@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
RoundType::CEIL,
output_shape.data());
}
MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor);
......
......@@ -267,7 +267,7 @@ struct ProposalFunctor {
// Our RPN implementation only supports a single input image, so all
// batch inds are 0
size = static_cast<int>(nms_result.size());
MACE_FAILURE_RETURN(output->Resize({size, 1, 1, 5}));
MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5}));
auto output_ptr = output->mutable_data<float>();
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
......
......@@ -50,7 +50,7 @@ struct PSROIAlignFunctor {
const index_t num_rois = rois->dim(0);
const index_t batch_size = input->dim(0);
MACE_FAILURE_RETURN(output->Resize({num_rois, pooled_height, pooled_width,
MACE_RETURN_IF_ERROR(output->Resize({num_rois, pooled_height, pooled_width,
output_dim_}));
T *output_ptr = output->mutable_data<T>();
......
......@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width};
MACE_FAILURE_RETURN(output->Resize(out_shape));
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
......
......@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
1,
std::multiplies<index_t>());
for (size_t i= 0; i < outputs_count; ++i) {
MACE_FAILURE_RETURN(output_list[i]->Resize(output_shape));
MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape));
output_ptrs[i] = output_list[i]->mutable_data<T>();
}
const T *input_ptr = input->data<T>();
......
......@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW,
output_shape.data());
MACE_FAILURE_RETURN(space_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
} else {
CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NCHW,
output_shape.data());
MACE_FAILURE_RETURN(batch_tensor->Resize(output_shape));
MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
}
Tensor::MappingGuard input_guard(space_tensor);
......
......@@ -15,7 +15,6 @@ cc_library(
hdrs = [
"ops_test_util.h",
],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
"//mace/core",
"@gtest//:gtest",
......@@ -36,18 +35,23 @@ cc_library(
[
"buffer_to_image.cc",
"image_to_buffer.cc",
]),
],
),
hdrs = glob(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] +
if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/kernels",
],
......@@ -60,13 +64,17 @@ cc_test(
srcs = glob(
["*_test.cc"],
),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] +
if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
......@@ -80,13 +88,17 @@ cc_test(
name = "ops_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] +
if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
......
......@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> {
functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")),
static_cast<T>(OperatorBase::GetOptionalArg<float>(
"max_limit", 0.0f))) {}
static_cast<T>(
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f))) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->Output(0);
MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensor));
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor));
return functor_(input_tensor, alpha_tensor, output_tensor, future);
}
......
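The Run() bodies in these operator hunks share one shape: resize the output like the input, propagate any allocation failure, then return the functor's status directly. Condensed, with stand-ins for Tensor and the functor:

enum MaceStatus { MACE_SUCCESS = 0, MACE_OUT_OF_RESOURCES = 1 };

struct Tensor {
  bool can_allocate = true;
  MaceStatus ResizeLike(const Tensor &) {
    return can_allocate ? MACE_SUCCESS : MACE_OUT_OF_RESOURCES;
  }
};

MaceStatus Functor(const Tensor &, Tensor *) { return MACE_SUCCESS; }

MaceStatus Run(const Tensor &input, Tensor *output) {
  MaceStatus s = output->ResizeLike(input);  // MACE_RETURN_IF_ERROR(...)
  if (s != MACE_SUCCESS) return s;
  return Functor(input, output);             // status flows to the caller
}

int main() {
  Tensor in, out;
  return Run(in, &out);  // MACE_SUCCESS
}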
......@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::GPU>();
}
namespace {
template <DeviceType D>
void TestSimpleRelux() {
......@@ -169,9 +168,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) {
TestSimpleRelux<DeviceType::GPU>();
}
TEST_F(ActivationOpTest, OPENCLSimple) { TestSimpleRelux<DeviceType::GPU>(); }
namespace {
template <DeviceType D>
......@@ -278,9 +275,7 @@ void TestSimplePrelu() {
}
} // namespace
TEST_F(ActivationOpTest, CPUSimplePrelu) {
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::GPU>();
......
......@@ -97,8 +97,8 @@ void SimpleAdd3() {
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 2, 3, 1},
{-0.000713, 8, 12, 16, 20, 24});
auto expected =
CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
}
......@@ -160,8 +160,8 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2);
}
}
} // namespace
......
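ExpectTensorNear is called above with two tolerances (e.g. 1e-2, 1e-2), presumably absolute and relative error bounds. A scalar sketch of such a check — the exact formula MACE uses is an assumption:

#include <cassert>
#include <cmath>

// Pass if |a - b| <= abs_tol + rel_tol * |b|, a common near-equality test.
bool Near(float a, float b, float abs_tol, float rel_tol) {
  return std::fabs(a - b) <= abs_tol + rel_tol * std::fabs(b);
}

int main() {
  assert(Near(8.004f, 8.0f, 1e-2f, 1e-2f));  // within tolerance
  assert(!Near(8.2f, 8.0f, 1e-2f, 1e-2f));   // outside tolerance
  return 0;
}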
......@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> {
var->dim_size());
Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future);
}
......
......@@ -22,7 +22,7 @@ namespace test {
class BatchNormOpTest : public OpsTestBase {};
namespace {
template<DeviceType D>
template <DeviceType D>
void Simple() {
OpsTestNet net;
......@@ -37,14 +37,14 @@ void Simple() {
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -62,14 +62,14 @@ void Simple() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -79,10 +79,9 @@ void Simple() {
}
// Check
auto expected =
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125,
0.8291, 0.8291, 3.1708, 3.1708,
5.5125, 5.5125, 7.8543, 7.8543});
auto expected = CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
}
......@@ -103,35 +102,31 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -140,25 +135,25 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1);
......@@ -170,7 +165,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -186,34 +181,30 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -222,26 +213,26 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1);
......@@ -253,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
}
......@@ -269,34 +260,30 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -305,25 +292,25 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning
setenv("MACE_TUNING", "1", 1);
......@@ -335,7 +322,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -351,34 +338,30 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -387,26 +370,26 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// tuning
setenv("MACE_TUNING", "1", 1);
......@@ -418,7 +401,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
}
......
......@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT);
Tensor *space_tensor = this->Output(OUTPUT);
return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future);
}
private:
......
......@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> {
bias->dim_size());
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, bias, output, future);
}
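// MACE_RETURN_IF_ERROR supersedes the older MACE_FAILURE_RETURN: it evaluates
// a statement yielding a MaceStatus and early-returns any non-success status
// to the caller. A minimal sketch of the idiom, stated as an assumption (the
// in-tree definition may differ in detail):
//
//   #define MACE_RETURN_IF_ERROR(stmt)              \
//     do {                                          \
//       MaceStatus status = (stmt);                 \
//       if (status != MaceStatus::MACE_SUCCESS) {   \
//         return status;                            \
//       }                                           \
//     } while (0)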
......
......@@ -32,21 +32,17 @@ void BiasAddSimple() {
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -79,9 +75,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { BiasAddSimple<DeviceType::GPU>(); }
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// generate random input
......@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
......@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -124,9 +114,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
......@@ -139,7 +129,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
......@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
Tensor expected;
......@@ -183,9 +169,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
......@@ -198,7 +184,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......
......@@ -71,27 +71,27 @@ TEST(BufferToImageTest, ArgLarge) {
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
......@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
{2}, input_data);
}
} // namespace test
......
......@@ -29,23 +29,19 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
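// With group = 4 over 8 channels, the shuffle interleaves the groups;
// assuming the usual ShuffleNet-style mapping, output channel c reads input
// channel (c % 4) * (8 / 4) + c / 4, so channels 0..7 map to 0, 2, 4, 6,
// 1, 3, 5, 7.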
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -65,7 +61,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
......@@ -78,7 +74,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = CreateTensor<float>(
......
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <functional>
#include <string>
#include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
......@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
int concat_axis_size = 0;
// Construct graph
std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
std::vector<const float *> input_ptrs(num_inputs);
OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i);
......@@ -171,10 +171,10 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL);
}
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -191,7 +191,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
auto output = net.GetOutput("Output");
......
......@@ -25,40 +25,36 @@ namespace test {
class Conv2dOpTest : public OpsTestBase {};
namespace {
template <DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -67,15 +63,15 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -91,41 +87,37 @@ void TestNHWCSimple3x3VALID() {
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
}
template <DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -134,15 +126,15 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -155,8 +147,8 @@ void TestNHWCSimple3x3SAME() {
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
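// With all-ones input (2 channels) and an all-ones 3x3 filter, each output is
// (number of in-bounds taps) * 2 + bias 0.1: corners 4 * 2 + 0.1 = 8.1, edges
// 6 * 2 + 0.1 = 12.1, and the centre 9 * 2 + 0.1 = 18.1, as listed above.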
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -173,40 +165,36 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
}
namespace {
template <DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -214,14 +202,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
......@@ -247,45 +235,40 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
}
namespace {
template <DeviceType D, typename T>
void TestNHWCCombined3x3() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {2, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
"Filter", {2, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -295,15 +278,15 @@ void TestNHWCCombined3x3() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -315,8 +298,8 @@ void TestNHWCCombined3x3() {
// Check
auto expected = CreateTensor<float>(
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......@@ -330,14 +313,13 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
}
namespace {
template <DeviceType D, typename T>
void TestFusedNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
......@@ -345,9 +327,7 @@ void TestFusedNHWCSimple3x3VALID() {
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -362,10 +342,8 @@ void TestFusedNHWCSimple3x3VALID() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -398,25 +376,21 @@ void TestFusedNHWCSimple3x3VALID() {
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
template <DeviceType D, typename T>
void TestFusedNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
......@@ -431,10 +405,8 @@ void TestFusedNHWCSimple3x3WithoutBias() {
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -478,48 +450,43 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
}
namespace {
template <DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {2, 5, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
"Filter", {2, 5, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -529,14 +496,14 @@ void TestConv1x1() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -548,13 +515,13 @@ void TestConv1x1() {
// Check
auto expected = CreateTensor<float>(
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
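// Each 1x1 output reduces the 5 all-ones input channels: the first filter's
// weights of 1.0 give 5 * 1.0 + 0.1 = 5.1, the second's weights of 2.0 give
// 5 * 2.0 + 0.2 = 10.2, repeated over all 3 x 10 positions.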
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -565,7 +532,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
namespace {
template <DeviceType D, typename T>
void TestComplexConvNxNS12(const std::vector<index_t> &shape,
const int stride) {
testing::internal::LogToStderr();
......@@ -584,33 +551,28 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
......@@ -625,22 +587,22 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
for (int kernel_size : {1, 3, 5, 7}) {
......@@ -666,7 +628,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
}
namespace {
template <DeviceType D>
void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -690,40 +652,36 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
&float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData(
{kernel_h, kernel_w, output_channels, input_channels},
&float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data
net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data);
"Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w},
float_filter_data);
"Filter", {output_channels, input_channels, kernel_h, kernel_w},
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
......@@ -738,23 +696,23 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
};
func(1, 1, VALID);
......@@ -767,20 +725,16 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 64}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {5, 5, 3, 63}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x7S1) {
......@@ -800,55 +754,45 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x1S1) {
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 63}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {15, 1, 64, 2}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {15, 1, 32, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2}, {1, 1});
TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {1, 15, 32, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16}, {2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16}, {2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16}, {4, 4});
}
namespace {
template <DeviceType D, typename T>
void TestDilationConvNxN(const std::vector<index_t> &shape,
const int dilation_rate) {
testing::internal::LogToStderr();
......@@ -868,33 +812,28 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
......@@ -909,22 +848,22 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-4, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
for (int kernel_size : {3}) {
......@@ -949,7 +888,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
}
namespace {
template <DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -975,9 +914,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -993,10 +930,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -1024,8 +959,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
};
func(1, 1, VALID);
......@@ -1034,17 +969,16 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3}, {2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
{2, 2});
}
namespace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr();
......@@ -1063,31 +997,27 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
......@@ -1102,21 +1032,21 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-4, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
for (int kernel_size : {3, 5, 7}) {
......@@ -1132,8 +1062,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
}
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, {2, 2});
}
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
......
......@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))),
paddings_(OperatorBase::GetRepeatedArgs<int>("padding_values")),
dilations_(OperatorBase::GetRepeatedArgs<int>("dilations", {1, 1})) {}
protected:
std::vector<int> strides_;
......
......@@ -31,9 +31,8 @@ TEST(CoreTest, INIT_MODE) {
.AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3});
{
Tensor::MappingGuard input_mapper(input);
......
......@@ -25,7 +25,7 @@ namespace test {
class Deconv2dOpTest : public OpsTestBase {};
namespace {
template <DeviceType D>
void RunTestSimple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const int stride,
......@@ -40,10 +40,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Add input data
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.TransformDataFormat<D, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -66,9 +63,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
......@@ -81,317 +76,165 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
auto expected = CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001);
}
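// The expected shapes in the cases below (when no explicit padding values
// are given) follow the usual transposed-conv sizing rule, stated here as an
// assumption about this op's convention:
//   SAME:  out = in * stride
//   VALID: out = in * stride + max(kernel - stride, 0)
// e.g. a 3x3 input with a 3x3 kernel gives 3x3 (SAME, S1), 5x5 (VALID, S1),
// 6x6 (SAME, S2) and 7x7 (VALID, S2).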
template <DeviceType D>
void TestNHWCSimple3x3SAME_S1() {
RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, Padding::SAME,
{0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4});
RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, Padding::VALID,
{2, 2}, {0}, {3, 3, 3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4});
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::SAME,
{0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 3, 3, 3}, {54, 66, 78, 126, 147, 168, 130, 146, 162,
198, 225, 252, 405, 450, 495, 366, 399, 432,
354, 378, 402, 630, 669, 708, 502, 530, 558});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::SAME, {2, 2}, {0},
{3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 3, 3, 3},
{54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450,
495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 558});
}
template <DeviceType D>
void TestNHWCSimple3x3SAME_S2() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {0, 0},
{1, 6, 6, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 6, 6, 3},
{1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4,
4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2,
2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {2, 2}, {0},
{3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 5, 5, 3}, {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4,
4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1,
1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4,
2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {0, 0},
{1, 6, 6, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 6, 6, 3},
{1, 2, 3, 4, 5, 6, 9, 12, 15, 8, 10, 12, 17, 22,
27, 12, 15, 18, 10, 11, 12, 13, 14, 15, 36, 39, 42, 26,
28, 30, 62, 67, 72, 39, 42, 45, 23, 28, 33, 38, 43, 48,
96, 108, 120, 64, 71, 78, 148, 164, 180, 90, 99, 108, 40, 44,
48, 52, 56, 60, 114, 123, 132, 65, 70, 75, 140, 151, 162, 78,
84, 90, 83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168,
304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192, 207,
222, 104, 112, 120, 218, 235, 252, 117, 126, 135});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {2, 2}, {0},
{3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3},
{13, 14, 15, 36, 39, 42, 26, 28, 30, 62, 67, 72, 39,
42, 45, 38, 43, 48, 96, 108, 120, 64, 71, 78, 148, 164,
180, 90, 99, 108, 52, 56, 60, 114, 123, 132, 65, 70, 75,
140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300, 142,
155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105, 192, 207,
222, 104, 112, 120, 218, 235, 252, 117, 126, 135});
}
template <DeviceType D>
void TestNHWCSimple3x3SAME_S2_1() {
RunTestSimple<D>(
{1, 3, 3, 1}, {12, 18, 12, 18, 27, 18, 12, 18, 12}, 2, Padding::SAME,
{0, 0}, {1, 5, 5, 3}, {3, 3, 3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 5, 5, 3},
{12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12,
30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18,
30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12});
}
template <DeviceType D>
void TestNHWCSimple3x3VALID_S2() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::VALID, {0, 0},
{1, 7, 7, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 7, 7, 3},
{1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1});
}
template <DeviceType D>
void TestNHWCSimple3x3VALID_S1() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {0, 0},
{1, 5, 5, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3},
{1, 2, 3, 6, 9, 12, 18, 24, 30, 26, 31, 36, 21,
24, 27, 14, 19, 24, 54, 66, 78, 126, 147, 168, 130, 146,
162, 90, 99, 108, 66, 78, 90, 198, 225, 252, 405, 450, 495,
366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630,
669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321,
336, 522, 546, 570, 398, 415, 432, 225, 234, 243});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {4, 4}, {0},
{3, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 5, 5, 3},
{1, 2, 3, 6, 9, 12, 18, 24, 30, 26, 31, 36, 21,
24, 27, 14, 19, 24, 54, 66, 78, 126, 147, 168, 130, 146,
162, 90, 99, 108, 66, 78, 90, 198, 225, 252, 405, 450, 495,
366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630,
669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321,
336, 522, 546, 570, 398, 415, 432, 225, 234, 243});
}
template <DeviceType D>
void TestNHWCSimple2x2SAME() {
RunTestSimple<D>({1, 2, 2, 1}, {1, 1, 1, 1}, 1, Padding::SAME, {0, 0},
{1, 2, 2, 1}, {3, 3, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f});
}
template <DeviceType D>
void TestNHWCSimple2x2VALID() {
RunTestSimple<D>(
{1, 2, 2, 1}, {1, 1, 1, 1}, 2, Padding::VALID, {0, 0}, {1, 5, 5, 1},
{3, 3, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 5, 5, 1},
{1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f,
2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f});
}
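// A minimal scatter-style transposed convolution, handy for reproducing the
// expected tensors above by hand. A sketch for one batch and one channel
// only; the name and signature are illustrative, not part of the MACE API,
// and it assumes the <vector> include this test file already has.
std::vector<float> Deconv2dRefSketch(const std::vector<float> &input,
                                     int in_h, int in_w,
                                     const std::vector<float> &filter,
                                     int k, int stride,
                                     int out_h, int out_w) {
  std::vector<float> output(out_h * out_w, 0.f);
  for (int h = 0; h < in_h; ++h) {
    for (int w = 0; w < in_w; ++w) {
      // Each input pixel scatters a k x k filter patch into the output.
      for (int i = 0; i < k; ++i) {
        for (int j = 0; j < k; ++j) {
          const int oh = h * stride + i;
          const int ow = w * stride + j;
          if (oh < out_h && ow < out_w) {
            output[oh * out_w + ow] +=
                input[h * in_w + w] * filter[i * k + j];
          }
        }
      }
    }
  }
  return output;
}
// With a 2x2 input of ones, a 3x3 filter of ones and stride 2, the scattered
// patches overlap once along output row/column 2 and four times at (2, 2),
// which is exactly the 1/2/4 pattern checked in TestNHWCSimple2x2VALID.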
} // namespace
......@@ -400,11 +243,11 @@ TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S1) {
}
TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2) {
TestNHWCSimple3x3SAME_S2<DeviceType::CPU>();
}
TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S2_1) {
TestNHWCSimple3x3SAME_S2_1<DeviceType::CPU>();
}
TEST_F(Deconv2dOpTest, CPUSimple2X2PaddingSame) {
......@@ -432,11 +275,11 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S1) {
}
TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2) {
TestNHWCSimple3x3SAME_S2<DeviceType::GPU>();
}
TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S2_1) {
TestNHWCSimple3x3SAME_S2_1<DeviceType::GPU>();
}
TEST_F(Deconv2dOpTest, OPENCLSimple2X2PaddingValid) {
......@@ -452,7 +295,7 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) {
}
namespace {
template <DeviceType D, typename T>
void TestComplexDeconvNxNS12(const int batch,
const std::vector<int> &shape,
const int stride) {
......@@ -473,14 +316,12 @@ void TestComplexDeconvNxNS12(const int batch,
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
int out_h = 0;
int out_w = 0;
std::vector<int> paddings;
std::vector<int> output_shape;
if (padding < 0) {
......@@ -496,8 +337,8 @@ void TestComplexDeconvNxNS12(const int batch,
output_shape.push_back(out_w);
output_shape.push_back(output_channels);
} else {
// out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
// out_w = (width - 1) * stride + 1 + padding - kernel_w + 1;
paddings.push_back(padding);
paddings.push_back(padding);
}
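// The commented-out relation above can be checked against the
// explicit-padding tests in this file; as a sketch (the helper name is
// illustrative, not a MACE API):
//   inline int DeconvOutSize(int in, int stride, int kernel, int padding) {
//     return (in - 1) * stride + 1 + padding - kernel + 1;
//   }
// e.g. TestNHWCSimple3x3VALID_S1 with paddings {4, 4} gives
// DeconvOutSize(3, 1, 3, 4) == 5, matching its {1, 5, 5, 3} output shape.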
......@@ -514,14 +355,11 @@ void TestComplexDeconvNxNS12(const int batch,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
......@@ -551,8 +389,8 @@ void TestComplexDeconvNxNS12(const int batch,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
for (int kernel_size : {1, 3, 5, 7}) {
......@@ -575,8 +413,8 @@ TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) {
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
......
......@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s,
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW")
......@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -64,118 +60,98 @@ void RunDepthToSpace(const bool d2s,
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
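// A minimal sketch of the NHWC -> NCHW transpose that TransformDataFormat is
// assumed to perform for the CPU paths above; illustrative only, not the
// OpsTestNet implementation.
inline std::vector<float> NHWCToNCHWSketch(const std::vector<float> &src,
                                           int n, int h, int w, int c) {
  std::vector<float> dst(src.size());
  for (int b = 0; b < n; ++b)
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        for (int k = 0; k < c; ++k)
          dst[((b * c + k) * h + y) * w + x] =
              src[((b * h + y) * w + x) * c + k];
  return dst;
}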
class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
true, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
true, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128}, in, 2,
{1, 384, 384, 32}, in);
}
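// Reference index mapping reconstructed from the expected tensors above; a
// sketch of NHWC depth-to-space with block size b, not the MACE kernel.
inline std::vector<float> DepthToSpaceSketch(const std::vector<float> &src,
                                             int n, int h, int w, int c,
                                             int b) {
  const int oc = c / (b * b), oh = h * b, ow = w * b;
  std::vector<float> dst(src.size());
  for (int bi = 0; bi < n; ++bi)
    for (int y = 0; y < oh; ++y)
      for (int x = 0; x < ow; ++x)
        for (int k = 0; k < oc; ++k) {
          // The (y % b, x % b) offset selects which channel block to read.
          const int in_c = ((y % b) * b + x % b) * oc + k;
          dst[((bi * oh + y) * ow + x) * oc + k] =
              src[((bi * h + y / b) * w + x / b) * c + in_c];
        }
  return dst;
}
// For Input1x2x16_B2 this reproduces {0..7, 16..23} in the first output row
// and {8..15, 24..31} in the second, as the CPU and OpenCL tests expect.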
namespace {
template <DeviceType D, typename T>
void RandomTest(const bool d2s,
const int block_size,
const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
srand(time(NULL));
......@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder(ops_name, ops_test_name)
.Input("InputNCHW")
......@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
}
} // namespace test
......
......@@ -22,7 +22,7 @@ namespace test {
class DepthwiseConv2dOpTest : public OpsTestBase {};
namespace {
template <DeviceType D, typename T>
void SimpleValidTest() {
testing::internal::LogToStderr();
// Construct graph
......@@ -30,31 +30,27 @@ void SimpleValidTest() {
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
"Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f});
"Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -63,15 +59,15 @@ void SimpleValidTest() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -85,8 +81,8 @@ void SimpleValidTest() {
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2},
{37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
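// Hand check of the first expected pixel, assuming the (multiplier, channel,
// kh, kw) filter layout that ComplexValidTest below uses (an assumption, not
// stated here): channel 0 is the 2x2 input patch {1, 2, 4, 5} dotted with
// {1, 2, 3, 4} = 1 + 4 + 12 + 20 = 37, plus the 0.1 bias = 37.1; channel 1
// is {2, 4, 8, 10} dotted with {2, 4, 6, 8} = 4 + 16 + 48 + 80 = 148, plus
// the 0.2 bias = 148.2.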
......@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
}
namespace {
template <DeviceType D, typename T>
void ComplexValidTest(index_t batch,
index_t channel,
index_t height,
index_t width,
index_t kernel,
index_t multiplier,
int stride) {
testing::internal::LogToStderr();
// Construct graph
......@@ -125,35 +125,29 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
std::vector<float> filter_data(kernel * kernel * channel * multiplier);
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel},
&filter_data);
net.AddInputFromArray<D, float>("Filter",
{multiplier, channel, kernel, kernel},
filter_data);
net.AddInputFromArray<D, float>(
"Filter", {multiplier, channel, kernel, kernel}, filter_data);
std::vector<float> bias_data(channel * multiplier);
GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
net.AddInputFromArray<D, float>("Bias", {channel * multiplier},
bias_data);
net.AddInputFromArray<D, float>("Bias", {channel * multiplier}, bias_data);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -162,15 +156,15 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
}
}
auto expected =
CreateTensor<T>({1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
......@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
}
namespace {
template <typename T>
void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -263,74 +257,66 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
"Input", {batch, height, width, input_channels});
net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier
* input_channels});
{multiplier * input_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Filter")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"),
1e-5, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-4);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"),
1e-2, 1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2,
1e-2);
}
};
......@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) {
}
} // namespace
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) { TestNxNS12<float>(4, 4); }
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12<half>(4, 4);
......
......@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> {
public:
EltwiseOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(
static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"),
OperatorBase::GetOptionalArg<float>("x", 1.0)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0);
const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
Tensor *output = this->Output(OUTPUT);
return functor_(input0, input1, output, future);
}
......
......@@ -36,10 +36,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input", shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input",
NHWC,
"TInput",
NCHW);
net.TransformDataFormat<D, float>("Input", NHWC, "TInput", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
......@@ -48,13 +45,10 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
} else {
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg")
.AddIntArg("type", static_cast<int>(type))
......@@ -90,10 +84,8 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input1", shape1, input1);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input0", NHWC,
"TInput0", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC,
"TInput1", NCHW);
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
......@@ -104,13 +96,12 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("TOutput", NCHW,
"Output", NHWC);
net.TransformDataFormat<D, float>("TOutput", NCHW, "Output", NHWC);
} else {
BufferToImage<D, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
.Input("InputImg1")
......@@ -138,278 +129,181 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
2, {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
2, {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::CPU, float>(
kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::CPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 4, 9, 16, 25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {2});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
2, {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {2, 4, 6, 8, 10, 12},
2, {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
3, {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::GPU, float>(
kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
1, {0, 1, 4, 9, 16, 25});
}
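// Scalar-eltwise reference semantics reconstructed from the expected outputs
// above; an illustration, not the MACE kernels, and it assumes <cmath> and
// <algorithm> are already available in this file. Note that NEG and ABS
// ignore the scalar operand, as the NEG/ABS cases show (the scalar 3 has no
// effect there).
inline float SumRef(float in, float x) { return in + x; }
inline float SubRef(float in, float x) { return in - x; }
inline float ProdRef(float in, float x) { return in * x; }
inline float DivRef(float in, float x) { return in / x; }
inline float MinRef(float in, float x) { return std::min(in, x); }
inline float MaxRef(float in, float x) { return std::max(in, x); }
inline float NegRef(float in, float) { return -in; }
inline float AbsRef(float in, float) { return std::fabs(in); }
inline float SqrDiffRef(float in, float x) { return (in - x) * (in - x); }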
TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
{1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
{1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
namespace {
......@@ -422,9 +316,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
......@@ -434,15 +326,13 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg")
.AddIntArg("type", static_cast<int>(type))
......@@ -455,13 +345,12 @@ void RandomTensorScalar(const kernels::EltwiseType type,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
}
}
......@@ -477,10 +366,10 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC,
"TInput0", NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC,
"TInput1", NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
......@@ -491,15 +380,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
.Input("InputImg1")
......@@ -513,13 +402,12 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
}
}
} // namespace
......@@ -549,88 +437,87 @@ TEST_F(EltwiseOpTest, RandomTensorScalarHalf) {
}
TEST_F(EltwiseOpTest, RandomTensorVecFloat) {
RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
{1, 1, 1, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
{5, 1, 1, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
{1, 1, 1, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 1, 1, 16},
{5, 32, 32, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
{1, 1, 1, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
{1, 31, 37, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {3, 1, 1, 17},
{3, 31, 37, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
{1, 32, 32, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {5, 31, 37, 17},
{5, 1, 1, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {5, 31, 37, 17},
{5, 1, 1, 17});
}
TEST_F(EltwiseOpTest, RandomTensorVecHalf) {
RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
{1, 1, 1, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
{3, 1, 1, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
{1, 1, 1, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 1, 1, 16},
{3, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
{1, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
{5, 1, 1, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
{1, 1, 1, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 1, 1, 17},
{5, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
{1, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
{3, 1, 1, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
{3, 1, 1, 17});
}
TEST_F(EltwiseOpTest, RandomTensorTensorFloat) {
RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
{1, 32, 32, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
{3, 32, 32, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
{1, 31, 37, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
{5, 31, 37, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
{1, 32, 32, 16});
RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
{3, 31, 37, 17});
RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
{3, 31, 37, 17});
}
TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
{1, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
{3, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
{1, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
{5, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
{1, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
{3, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
{3, 31, 37, 17});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
offset->dim_size());
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
}
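// MACE_RETURN_IF_ERROR follows the usual early-return pattern for status
// macros; a hedged sketch of the conventional expansion (the real definition
// lives elsewhere in the tree and may differ):
//
//   #define MACE_RETURN_IF_ERROR(stmt)            \
//     {                                           \
//       MaceStatus status = (stmt);               \
//       if (status != MaceStatus::MACE_SUCCESS) { \
//         return status;                          \
//       }                                         \
//     }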
......
......@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
}
}
template <DeviceType D>
void Simple() {
OpsTestNet net;
......@@ -52,11 +52,11 @@ void Simple() {
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
......@@ -69,11 +69,11 @@ void Simple() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -83,10 +83,9 @@ void Simple() {
}
// Check
auto expected = CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
}
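// The expected tensor above is just scale * x + offset applied per channel.
// The assumed folding behind CalculateScaleOffset, whose body sits outside
// this hunk, is the standard batch-norm collapse:
//   scale[c]  = gamma[c] / sqrt(var[c] + epsilon)
//   offset[c] = beta[c] - mean[c] * scale[c]
// so the FoldedBatchNorm op needs no statistics at run time.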
......@@ -108,29 +107,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -139,25 +134,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -173,29 +168,25 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -204,26 +195,26 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
}
......@@ -239,29 +230,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -270,24 +257,24 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -303,29 +290,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -334,25 +317,25 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
}
......
......@@ -23,15 +23,15 @@
namespace mace {
namespace ops {
template<DeviceType D, class T>
template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> {
public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
: Operator<D, T>(operator_def, ws),
functor_(kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
if (D == DeviceType::CPU) {
MACE_CHECK(input->dim(1) == weight->dim(1)
&& input->dim(2) == weight->dim(2)
&& input->dim(3) == weight->dim(3)
&& weight->dim(0) == bias->dim(0),
"The shape of Input: ",
MakeString(input->shape()),
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
MACE_CHECK(
input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) &&
input->dim(3) == weight->dim(3) && weight->dim(0) == bias->dim(0),
"The shape of Input: ", MakeString(input->shape()),
"The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
bias->dim(0), " don't match.");
} else {
MACE_CHECK(input->dim(1) == weight->dim(2)
&& input->dim(2) == weight->dim(3)
&& input->dim(3) == weight->dim(1)
&& weight->dim(0) == bias->dim(0),
"The shape of Input: ",
MakeString(input->shape()),
"The shape of Weight: ",
MakeString(weight->shape()),
" and Bias ",
bias->dim(0),
" don't match.");
MACE_CHECK(
input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) &&
input->dim(3) == weight->dim(1) && weight->dim(0) == bias->dim(0),
"The shape of Input: ", MakeString(input->shape()),
"The shape of Weight: ", MakeString(weight->shape()), " and Bias ",
bias->dim(0), " don't match.");
}
return functor_(input, weight, bias, output, future);
......
......@@ -24,7 +24,7 @@ namespace test {
class FullyConnectedOpTest : public OpsTestBase {};
namespace {
template<DeviceType D>
template <DeviceType D>
void Simple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_value,
const std::vector<index_t> &weight_shape,
......@@ -42,11 +42,11 @@ void Simple(const std::vector<index_t> &input_shape,
if (D == DeviceType::CPU) {
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
......@@ -59,11 +59,11 @@ void Simple(const std::vector<index_t> &input_shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -86,14 +86,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPU) {
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
Simple<DeviceType::CPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 1, 2, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 1, 2, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::CPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 1, 2, 3},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 1, 2, 3},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
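Note: the expected outputs are the flattened input-weight dot products plus bias; for the second case above:

387 = \sum_{i=1}^{10} i^2 + 2 = 385 + 2, \qquad 3853 = 10 \sum_{i=1}^{10} i^2 + 3 = 3850 + 3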
TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
......@@ -103,26 +103,26 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 2, 2},
{1, 3, 5, 7, 2, 4, 6, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
{1, 3, 5, 7, 2, 4, 6, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
Simple<DeviceType::GPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 5, 1, 2},
{1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 10, 60, 20, 70, 30, 80, 40, 90, 50, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 5, 1, 2},
{1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 10, 60, 20, 70, 30, 80, 40, 90, 50, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2},
{1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6,
10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2},
{1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2,
5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 1, 2},
{1, 3, 2, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
{1, 3, 2, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
namespace {
template<typename T>
template <typename T>
void Random(const index_t batch,
const index_t height,
const index_t width,
......@@ -134,22 +134,20 @@ void Random(const index_t batch,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, channels, height, width});
"Weight", {out_channel, channels, height, width});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
......@@ -169,12 +167,12 @@ void Random(const index_t batch,
kernels::BufferType::ARGUMENT);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
......@@ -182,11 +180,11 @@ void Random(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-1);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1,
1e-1);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-3);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-3);
}
}
} // namespace
......
......@@ -25,8 +25,7 @@ template <DeviceType D, class T>
class LocalResponseNormOp : public Operator<D, T> {
public:
LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_() {
: Operator<D, T>(operator_def, ws), functor_() {
depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5);
bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f);
alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f);
......@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> {
input->dim_size());
Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future);
}
......
......@@ -21,7 +21,7 @@ namespace test {
class LocalResponseNormOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void Simple() {
OpsTestNet net;
......@@ -33,22 +33,22 @@ void Simple() {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNCHW")
.AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
}
// Check
auto expected =
CreateTensor<float>({1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51,
0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
auto expected = CreateTensor<float>(
{1, 1, 2, 6},
{0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0, 1e-2);
}
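Note: with depth_radius 5, bias 1.0, alpha 1.0, and beta 0.5 as above, the expected values assume TensorFlow's across-channel normalization:

y_{b,h,w,c} = x_{b,h,w,c} \cdot \left( bias + \alpha \sum_{c'=c-r}^{c+r} x_{b,h,w,c'}^2 \right)^{-\beta}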
......
......@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
{22, 28, 49, 64});
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
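// Note: the first expected tensor is the plain 2x3 by 3x2 matrix product:
//   \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{pmatrix}
//   \begin{pmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \end{pmatrix}
//   = \begin{pmatrix} 22 & 28 \\ 49 & 64 \end{pmatrix}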
Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
......@@ -127,10 +126,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("A",
{batch, height, channels, 1});
net.AddRandomInput<DeviceType::GPU, float>(
"B", {batch, channels, out_width, 1});
net.AddRandomInput<DeviceType::GPU, float>("A", {batch, height, channels, 1});
net.AddRandomInput<DeviceType::GPU, float>("B",
{batch, channels, out_width, 1});
// run cpu
net.RunOp();
......@@ -141,9 +139,9 @@ void Complex(const index_t batch,
// Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulTest")
.Input("AImage")
......@@ -156,13 +154,13 @@ void Complex(const index_t batch,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT);
kernels::BufferType::IN_OUT_HEIGHT);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-5);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5,
1e-5);
}
}
} // namespace
......
......@@ -112,12 +112,12 @@ class OpsTestNet {
public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name,
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
......@@ -125,25 +125,25 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T));
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape,
const T data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
std::fill(input_data, input_data + input->size(), data);
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape,
bool positive = true,
bool truncate = false) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
......@@ -153,15 +153,15 @@ class OpsTestNet {
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > 100.f) d = 100.f;
if (std::abs(d) < 0.001f) d = 0.001f;
}
return half_float::half_cast<half>(positive ?std::abs(d) : d);
});
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > 100.f) d = 100.f;
if (std::abs(d) < 0.001f) d = 0.001f;
}
return half_float::half_cast<half>(positive ? std::abs(d) : d);
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive, truncate] {
......@@ -170,17 +170,15 @@ class OpsTestNet {
if (std::abs(d) > 100.f) d = 100.f;
if (std::abs(d) < 0.001f) d = 0.001f;
}
return (positive ?std::abs(d) : d);
return (positive ? std::abs(d) : d);
});
}
}
template<DeviceType D, typename T>
void Transpose2D(const std::string &src_name,
const std::string &dst_name) {
template <DeviceType D, typename T>
void Transpose2D(const std::string &src_name, const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name,
GetDeviceAllocator(D),
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 2, "input shape != 2");
......@@ -192,19 +190,18 @@ class OpsTestNet {
for (index_t i = 0; i < input_shape[0]; ++i) {
for (index_t j = 0; j < input_shape[1]; ++j) {
output_data[j * input_shape[0] + i] =
input_data[i * input_shape[1] + j];
input_data[i * input_shape[1] + j];
}
}
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void TransformDataFormat(const std::string &src_name,
const DataFormat src_format,
const std::string &dst_name,
const DataFormat dst_format) {
Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name,
GetDeviceAllocator(D),
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4");
......@@ -224,7 +221,7 @@ class OpsTestNet {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
input_data[((b * height + h) * width + w) * channels + c];
}
}
}
......@@ -244,7 +241,7 @@ class OpsTestNet {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
output_data[((b * height + h) * width + w) * channels + c] =
input_data[((b * channels + c) * height + h) * width + w];
input_data[((b * channels + c) * height + h) * width + w];
}
}
}
......@@ -264,7 +261,7 @@ class OpsTestNet {
for (index_t i = 0; i < oi; ++i) {
for (index_t j = 0; j < hw; ++j) {
output_data[i * height * width + j] =
input_data[j * out_channels * in_channels + i];
input_data[j * out_channels * in_channels + i];
}
}
} else if (src_format == OIHW && dst_format == HWOI) {
......@@ -282,7 +279,7 @@ class OpsTestNet {
for (index_t i = 0; i < hw; ++i) {
for (index_t j = 0; j < oi; ++j) {
output_data[i * out_channels * in_channels + j] =
input_data[j * height * width + i];
input_data[j * height * width + i];
}
}
} else if (src_format == HWIO && dst_format == OIHW) {
......@@ -300,7 +297,8 @@ class OpsTestNet {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t k = 0; k < hw; ++k) {
output_data[((m * in_channels) + c) * height * width + k] =
input_data[k * out_channels * in_channels + c * out_channels + m];
input_data[k * out_channels * in_channels + c * out_channels +
m];
}
}
}
......@@ -309,12 +307,11 @@ class OpsTestNet {
}
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw,
GetDeviceAllocator(D),
Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0];
......@@ -329,7 +326,7 @@ class OpsTestNet {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
input_data[((b * height + h) * width + w) * channels + c];
}
}
}
......@@ -370,14 +367,12 @@ class OpsTestNet {
// DEPRECATED(liyin):
// Test and benchmark should setup model once and run multiple times.
// Setup time should not be counted during benchmark.
MaceStatus RunOp() {
return RunOp(DeviceType::CPU);
}
MaceStatus RunOp() { return RunOp(DeviceType::CPU); }
MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
device_ = device;
net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT);
MACE_FAILURE_RETURN(net_->Run());
MACE_RETURN_IF_ERROR(net_->Run());
net_ = CreateNet(op_registry_, net_def, &ws_, device);
return net_->Run();
}
......@@ -415,7 +410,7 @@ class OpsTestBase : public ::testing::Test {
}
};
template<typename T>
template <typename T>
void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> *res,
bool positive = true) {
......@@ -430,11 +425,10 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
res->resize(size);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res->begin(), res->end(),
[&gen, &nd, positive] {
return half_float::half_cast<half>(
positive ? std::abs(nd(gen)) : nd(gen));
});
std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen))
: nd(gen));
});
} else {
std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
......@@ -442,7 +436,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
}
}
template<typename T>
template <typename T>
void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::vector<T> *res,
const T a = 0,
......@@ -460,7 +454,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); });
}
template<typename T>
template <typename T>
std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
std::vector<T> dest;
dest.reserve(src.size());
......@@ -470,11 +464,11 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return std::move(dest);
}
template<typename T>
template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
......@@ -504,24 +498,24 @@ inline std::string ShapeToString(const Tensor &x) {
return std::string(stream.str());
}
template<typename T>
template <typename T>
struct is_floating_point_type {
static const bool value = std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, half>::value;
std::is_same<T, double>::value ||
std::is_same<T, half>::value;
};
template<typename T>
template <typename T>
inline void ExpectEqual(const T &a, const T &b) {
EXPECT_EQ(a, b);
}
template<>
template <>
inline void ExpectEqual<float>(const float &a, const float &b) {
EXPECT_FLOAT_EQ(a, b);
}
template<>
template <>
inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b);
}
......@@ -531,13 +525,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) {
<< "y.shape [ " << ShapeToString(y) << "]";
}
template<typename EXP_TYPE,
typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value>
template <typename EXP_TYPE,
typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector;
// Partial specialization for float and double.
template<typename EXP_TYPE, typename RES_TYPE>
template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
......@@ -554,7 +548,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
}
}
static void Near(const Tensor &x, const Tensor &y,
static void Near(const Tensor &x,
const Tensor &y,
const double rel_err,
const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
......@@ -588,7 +583,7 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
}
};
template<typename EXP_TYPE, typename RES_TYPE>
template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, false> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
......@@ -605,7 +600,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
}
}
static void Near(const Tensor &x, const Tensor &y,
static void Near(const Tensor &x,
const Tensor &y,
const double rel_err,
const double abs_err) {
MACE_UNUSED(rel_err);
......@@ -614,21 +610,23 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
}
};
template<typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y,
template <typename T>
void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5,
const double abs_err = 1e-8) {
Expector<T, T>::Near(x, y, rel_err, abs_err);
}
template<typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x, const Tensor &y,
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5,
const double abs_err = 1e-8) {
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, rel_err, abs_err);
}
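Note: a sketch of the per-element check Expector::Near presumably performs, assuming the usual combined absolute/relative tolerance (the exact expression in ops_test_util.h may differ):

// Hypothetical per-element assertion, for illustration only.
EXPECT_NEAR(x_data[i], y_data[i],
            abs_err + rel_err * std::abs(static_cast<double>(x_data[i])));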
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
......@@ -636,11 +634,11 @@ void BufferToImage(OpsTestNet *net,
MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......@@ -648,7 +646,7 @@ void BufferToImage(OpsTestNet *net,
net->Sync();
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
......@@ -656,11 +654,11 @@ void ImageToBuffer(OpsTestNet *net,
MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......
......@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> {
PadOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings"),
OperatorBase::GetOptionalArg<float>("constant_value", 0.0))
{}
OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0);
......
......@@ -45,9 +45,7 @@ void Simple() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
......@@ -59,33 +57,25 @@ void Simple() {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
}
auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>({1, 5, 6, 1},
{
1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 2, 2, 2, 1.0, 1.0,
1.0, 2, 2, 2, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
});
auto expected = CreateTensor<float>(
{1, 5, 6, 1}, {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2,
1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
});
ExpectTensorNear<float>(*expected, *output, 1e-5);
}
} // namespace
TEST_F(PadTest, SimpleCPU) {
Simple<DeviceType::CPU>();
}
TEST_F(PadTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(PadTest, SimpleGPU) {
Simple<DeviceType::GPU>();
}
TEST_F(PadTest, SimpleGPU) { Simple<DeviceType::GPU>(); }
TEST_F(PadTest, ComplexCPU) {
// Construct graph
......@@ -93,9 +83,7 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data
net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
......@@ -106,9 +94,7 @@ TEST_F(PadTest, ComplexCPU) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
auto output = net.GetTensor("Output");
......@@ -134,9 +120,7 @@ void Complex(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
......@@ -147,16 +131,14 @@ void Complex(const std::vector<index_t> &input_shape,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pad", "PadTest")
.Input("InputImage")
.Output("OutputImage")
......@@ -168,7 +150,7 @@ void Complex(const std::vector<index_t> &input_shape,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
auto output = net.GetTensor("OpenCLOutput");
......@@ -181,24 +163,23 @@ void Complex(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(PadTest, ComplexFloat) {
Complex<float>({1, 32, 32, 4},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<float>({1, 31, 37, 16},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<float>({1, 128, 128, 32},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
Complex<float>({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1},
{0, 0, 2, 2, 1, 1, 0, 0});
Complex<float>({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0},
{0, 0, 2, 0, 1, 0, 0, 0});
Complex<float>({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2},
{0, 0, 0, 1, 0, 2, 0, 0});
}
TEST_F(PadTest, ComplexHalf) {
Complex<half>({1, 32, 32, 4},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<half>({1, 31, 37, 16},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<half>({1, 128, 128, 32},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
Complex<half>({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1},
{0, 0, 2, 2, 1, 1, 0, 0});
Complex<half>({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0},
{0, 0, 2, 0, 1, 0, 0, 0});
Complex<half>({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2},
{0, 0, 0, 1, 0, 2, 0, 0});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -31,36 +31,32 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
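Note: with a 2x2 kernel, stride 2, and VALID padding, each spatial axis of the 4x4 input yields

H_{out} = \left\lfloor \frac{H - K}{S} \right\rfloor + 1 = \left\lfloor \frac{4 - 2}{2} \right\rfloor + 1 = 2,

and, for example, channel 0's first window takes \max\{0, 1, 4, 5\} = 5.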
......@@ -73,27 +69,23 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -108,30 +100,26 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {2, 2})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {2, 2})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -146,31 +134,26 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -180,49 +163,45 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
}
namespace {
template<DeviceType D>
template <DeviceType D>
void SimpleMaxPooling3S2() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
"Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -237,12 +216,10 @@ void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<GPU>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2<GPU>(); }
namespace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
......@@ -252,27 +229,23 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
Tensor expected;
......@@ -281,22 +254,22 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-3, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-4);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......@@ -330,63 +303,58 @@ TEST_F(PoolingOpTest, AVG_VALID) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
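Note: same geometry as MAX_VALID, but averaged: channel 0's first window gives (0 + 1 + 4 + 5) / 4 = 2.5 and channel 1's gives (16 + 17 + 20 + 21) / 4 = 18.5, matching the expected tensor.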
namespace {
template<DeviceType D>
template <DeviceType D>
void SimpleAvgPoolingTest() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
......@@ -399,12 +367,10 @@ void SimpleAvgPoolingTest() {
}
} // namespace
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<GPU>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest<GPU>(); }
namespace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
......@@ -415,27 +381,23 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
Tensor expected;
......@@ -444,68 +406,60 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-3, 1e-3);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-3);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
}
} // namespace
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::VALID);
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::SAME);
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::VALID);
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID);
Padding::VALID);
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME);
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID);
Padding::VALID);
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME);
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
Padding::VALID);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
Padding::SAME);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::VALID);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::SAME);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
}
} // namespace test
......
......@@ -45,17 +45,17 @@ TEST_F(ProposalOpTest, CPUSimple) {
.Finalize(net.NewOperatorDef());
std::vector<float> scores(height * width * 18);
for (size_t i = 0 ; i < scores.size(); ++i) {
for (size_t i = 0; i < scores.size(); ++i) {
scores[i] = i;
}
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"RpnCLSProb", {1, height, width, 18}, scores);
net.AddRepeatedInput<DeviceType::CPU, float>(
"RpnBBoxPred", {1, height, width, 4 * 9}, 1);
net.AddInputFromArray<DeviceType::CPU, float>(
"ImgInfo", {1, 1, 1, 3}, {img_height, img_width, 2});
net.AddInputFromArray<DeviceType::CPU, float>("RpnCLSProb",
{1, height, width, 18}, scores);
net.AddRepeatedInput<DeviceType::CPU, float>("RpnBBoxPred",
{1, height, width, 4 * 9}, 1);
net.AddInputFromArray<DeviceType::CPU, float>("ImgInfo", {1, 1, 1, 3},
{img_height, img_width, 2});
// Run
net.RunOp();
......@@ -65,7 +65,6 @@ TEST_F(ProposalOpTest, CPUSimple) {
ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -35,9 +35,9 @@ void Register_Dequantize(OperatorRegistry *op_registry) {
void Register_Requantize(OperatorRegistry *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
RequantizeOp<DeviceType::CPU, uint8_t>);
}
......
......@@ -21,12 +21,11 @@
namespace mace {
namespace ops {
template<DeviceType D, class T>
template <DeviceType D, class T>
class QuantizeOp : public Operator<D, T> {
public:
QuantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
}
: Operator<D, T>(operator_def, ws) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -39,9 +38,9 @@ class QuantizeOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
Tensor *out_min = this->Output(OUT_MIN);
Tensor *out_max = this->Output(OUT_MAX);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_FAILURE_RETURN(out_min->ResizeLike(in_min));
MACE_FAILURE_RETURN(out_max->ResizeLike(in_max));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(out_min->ResizeLike(in_min));
MACE_RETURN_IF_ERROR(out_max->ResizeLike(in_max));
return functor_(input, in_min, in_max, output, out_min, out_max, future);
}
......@@ -54,12 +53,11 @@ class QuantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
};
template<DeviceType D, class T>
template <DeviceType D, class T>
class DequantizeOp : public Operator<D, T> {
public:
DequantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
}
: Operator<D, T>(operator_def, ws) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -70,7 +68,7 @@ class DequantizeOp : public Operator<D, T> {
MACE_CHECK(in_max->size() == 1, "max val tensor has more than 1 value");
Tensor *output = this->Output(OUTPUT);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, in_min, in_max, output, future);
}
......@@ -83,12 +81,11 @@ class DequantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
template<DeviceType D, class T>
template <DeviceType D, class T>
class RequantizeOp : public Operator<D, T> {
public:
RequantizeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
}
: Operator<D, T>(operator_def, ws) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -112,19 +109,12 @@ class RequantizeOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
Tensor *out_min = this->Output(OUT_MIN);
Tensor *out_max = this->Output(OUT_MAX);
MACE_FAILURE_RETURN(output->ResizeLike(input));
MACE_FAILURE_RETURN(out_min->ResizeLike(in_min));
MACE_FAILURE_RETURN(out_max->ResizeLike(out_max));
return functor_(input,
in_min,
in_max,
rerange_min,
rerange_max,
output,
out_min,
out_max,
future);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_RETURN_IF_ERROR(out_min->ResizeLike(in_min));
MACE_RETURN_IF_ERROR(out_max->ResizeLike(in_max));
return functor_(input, in_min, in_max, rerange_min, rerange_max, output,
out_min, out_max, future);
}
private:
......
......@@ -26,22 +26,21 @@ TEST_F(QuantizeTest, TestQuantize) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<CPU, float>("Input", {1, 2, 3, 1}, {
-2, -1, 1, 2, 3, 4
});
net.AddInputFromArray<CPU, float>("Input", {1, 2, 3, 1},
{-2, -1, 1, 2, 3, 4});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Quantize", "QuantizeTest")
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
......@@ -50,10 +49,8 @@ TEST_F(QuantizeTest, TestQuantize) {
auto output_min = net.GetTensor("OutputMin");
auto output_max = net.GetTensor("OutputMax");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1},
{
32, 64, 127, 159, 191, 223
});
auto expected_output =
CreateTensor<uint8_t>({1, 2, 3, 1}, {32, 64, 127, 159, 191, 223});
auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5});
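
The expected codes above follow from plain affine uint8 quantization of the range [-3, 5]: the step is 8/255, so for example -2 maps to round((-2 + 3) * 255 / 8) = 32. The reported minimum of about -3.01887 is consistent with nudging the range so that real 0.0 lands exactly on an integer code (96 here); that nudging scheme is inferred from the constants, not taken from MACE's kernel. A sketch:

#include <cmath>
#include <cstdio>

// Illustrative affine quantization consistent with TestQuantize's numbers;
// not MACE's actual implementation. Note x = 1 sits exactly on the half
// step 127.5, so whether it rounds to the test's 127 or to 128 depends on
// float rounding details, which is why it is skipped here.
int main() {
  const float in_min = -3.0f, in_max = 5.0f;
  const float scale = (in_max - in_min) / 255.0f;                  // 8/255
  const int zero = static_cast<int>(std::round(-in_min / scale));  // 96
  const float nudged_scale = in_max / (255.0f - zero);             // 5/159
  std::printf("nudged min = %f\n", -zero * nudged_scale);          // ~ -3.018868
  for (float x : {-2.0f, -1.0f, 2.0f, 3.0f, 4.0f}) {
    std::printf("%g -> %d\n", x,
                static_cast<int>(std::round((x - in_min) / scale)));
  }
  return 0;  // prints 32, 64, 159, 191, 223
}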
......@@ -69,27 +66,25 @@ TEST_F(QuantizeTest, TestQuantizeTrend) {
// Add input data
net.AddRandomInput<CPU, float>("Input", {100});
const float *input_data = net.GetTensor("Input")->data<float>();
net.AddInputFromArray<CPU, float>("InputMin",
{1},
{*std::min_element(input_data,
input_data
+ net.GetTensor("Input")->size())});
net.AddInputFromArray<CPU, float>("InputMax",
{1},
{*std::max_element(input_data,
input_data
+ net.GetTensor("Input")->size())});
net.AddInputFromArray<CPU, float>(
"InputMin", {1},
{*std::min_element(input_data,
input_data + net.GetTensor("Input")->size())});
net.AddInputFromArray<CPU, float>(
"InputMax", {1},
{*std::max_element(input_data,
input_data + net.GetTensor("Input")->size())});
OpDefBuilder("Quantize", "QuantizeTest")
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
......@@ -113,29 +108,26 @@ TEST_F(QuantizeTest, TestDequantize) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<CPU, uint8_t>("Input", {1, 2, 3, 1}, {
32, 64, 127, 159, 191, 223
});
net.AddInputFromArray<CPU, uint8_t>("Input", {1, 2, 3, 1},
{32, 64, 127, 159, 191, 223});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3.01887});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Dequantize", "DequantizeTest")
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<float>({1, 2, 3, 1},
{
-2, -1, 1, 2, 3, 4
});
auto expected_output =
CreateTensor<float>({1, 2, 3, 1}, {-2, -1, 1, 2, 3, 4});
auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5});
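
Dequantization is the inverse affine map, roughly x = min + q * (max - min) / 255; with min = -3.01887 and max = 5 this lands near the original values (32 -> about -2.01, 223 -> about 3.99), so the exact kernel arithmetic and the comparison tolerance must absorb a fraction of a quantization step. A sketch of the inverse map (illustrative only):

#include <cstdio>

// Inverse affine map for TestDequantize's constants; illustrative, not
// MACE's kernel, and only approximate at the last decimal places.
int main() {
  const float out_min = -3.01887f, out_max = 5.0f;
  const float scale = (out_max - out_min) / 255.0f;
  for (int q : {32, 64, 127, 159, 191, 223}) {
    std::printf("%d -> %f\n", q, out_min + q * scale);  // ~ -2.01, -1.01, ...
  }
  return 0;
}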
......@@ -147,35 +139,33 @@ TEST_F(QuantizeTest, TestRequantizeWithMinMax) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<CPU, int>("Input", {1, 2, 3, 1}, {
-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647
});
net.AddInputFromArray<CPU, int>(
"Input", {1, 2, 3, 1},
{-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
net.AddInputFromArray<CPU, float>("RerangeMin", {1}, {-3.01887});
net.AddInputFromArray<CPU, float>("RerangeMax", {1}, {5});
OpDefBuilder("Requantize", "RequantizeTest")
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Input("RerangeMin")
.Input("RerangeMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Input("RerangeMin")
.Input("RerangeMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1},
{
32, 64, 128, 160, 191, 223
});
auto expected_output =
CreateTensor<uint8_t>({1, 2, 3, 1}, {32, 64, 128, 160, 191, 223});
auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5});
......@@ -187,31 +177,29 @@ TEST_F(QuantizeTest, TestRequantizeWithoutMinMax) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<CPU, int>("Input", {1, 2, 3, 1}, {
-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647
});
net.AddInputFromArray<CPU, int>(
"Input", {1, 2, 3, 1},
{-1073741824, -536870912, 536870912, 1073741824, 1610612736, 2147483647});
net.AddInputFromArray<CPU, float>("InputMin", {1}, {-3});
net.AddInputFromArray<CPU, float>("InputMax", {1}, {5});
OpDefBuilder("Requantize", "RequantizeTest")
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("InputMin")
.Input("InputMax")
.Output("Output")
.Output("OutputMin")
.Output("OutputMax")
.OutputType({DT_UINT8, DT_FLOAT, DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
auto output = net.GetTensor("Output");
auto expected_output = CreateTensor<uint8_t>({1, 2, 3, 1},
{
0, 43, 128, 170, 213, 255
});
auto expected_output =
CreateTensor<uint8_t>({1, 2, 3, 1}, {0, 43, 128, 170, 213, 255});
auto expected_min = CreateTensor<float>({1}, {-3.01887});
auto expected_max = CreateTensor<float>({1}, {5});
ExpectTensorNear<uint8_t>(*expected_output, *output);
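
The expectations in the no-min/max variant are consistent with rescaling by the extrema actually present in the int32 input: with lo = -2^30 and hi = 2^31 - 1, q8 = round((q32 - lo) * 255 / (hi - lo)) reproduces {0, 43, 128, 170, 213, 255} exactly. This rule is inferred from the test data, not from MACE's kernel source; a quick check:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Rescale int32 codes to uint8 using the observed extrema; reproduces
// TestRequantizeWithoutMinMax's expected outputs, but the rule is inferred
// from the data above rather than read out of MACE's kernel.
int main() {
  const int64_t vals[] = {-1073741824, -536870912, 536870912,
                          1073741824,  1610612736, 2147483647};
  const int64_t lo = -1073741824, hi = 2147483647;
  for (int64_t q : vals) {
    const double scaled = static_cast<double>(q - lo) * 255.0 / (hi - lo);
    std::printf("%lld -> %d\n", static_cast<long long>(q),
                static_cast<int>(std::round(scaled)));
  }
  return 0;  // prints 0, 43, 128, 170, 213, 255
}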
......
......@@ -26,9 +26,8 @@ class ResizeBilinearOp : public Operator<D, T> {
public:
ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0);
......
......@@ -15,8 +15,8 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/resize_bilinear.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/resize_bilinear.h"
namespace mace {
namespace ops {
......@@ -33,22 +33,18 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
......@@ -66,26 +62,21 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
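
The expected tensor follows from the align_corners sampling rule src = dst * (in - 1) / (out - 1): resizing width 4 down to 2 samples source columns 0 and 3 exactly, so the iota-filled {1, 2, 4, 3} input yields pixels (0,0) = {0, 1, 2} and (0,3) = {9, 10, 11} with no interpolation at all. A sketch of the coordinate arithmetic (illustrative):

#include <cstdio>

// align_corners source-coordinate rule for ResizeBilinearWAlignCorners:
// width 4 -> 2 picks source columns 0 and 3 exactly.
int main() {
  const int in_w = 4, out_w = 2;
  const float scale = static_cast<float>(in_w - 1) / (out_w - 1);  // 3.0
  for (int dst = 0; dst < out_w; ++dst) {
    std::printf("dst %d -> src %.1f\n", dst, dst * scale);  // 0.0, 3.0
  }
  return 0;
}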
......@@ -111,9 +102,7 @@ void TestRandomResizeBilinear() {
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
......@@ -124,10 +113,8 @@ void TestRandomResizeBilinear() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -149,8 +136,8 @@ void TestRandomResizeBilinear() {
kernels::BufferType::IN_OUT_CHANNEL);
}
// Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"),
1e-5, 1e-6);
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-6);
}
}
} // namespace
......
......@@ -32,12 +32,12 @@ class SliceOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->OutputSize() >= 2)
<< "There must be at least two outputs for slicing";
<< "There must be at least two outputs for slicing";
const Tensor *input = this->Input(INPUT);
const std::vector<Tensor *> output_list = this->Outputs();
const int32_t slice_axis = OperatorBase::GetOptionalArg<int>("axis", 3);
MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
<< "Outputs do not split input equally.";
<< "Outputs do not split input equally.";
return functor_(input, output_list, future);
}
......
......@@ -16,8 +16,8 @@
#include <vector>
#include "gmock/gmock.h"
#include "mace/ops/slice.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/slice.h"
namespace mace {
namespace ops {
......@@ -26,7 +26,7 @@ namespace test {
class SliceOpTest : public OpsTestBase {};
namespace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void RandomTest(const int num_outputs, const int axis) {
static unsigned int seed = time(NULL);
const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
......@@ -43,10 +43,8 @@ void RandomTest(const int num_outputs, const int axis) {
input_shape = {batch, input_channels, height, width};
else if (axis == 3)
input_shape = {batch, height, width, input_channels};
const index_t input_size = std::accumulate(input_shape.begin(),
input_shape.end(),
1,
std::multiplies<index_t>());
const index_t input_size = std::accumulate(
input_shape.begin(), input_shape.end(), 1, std::multiplies<index_t>());
std::vector<float> input_data(input_size);
GenerateRandomRealTypeData(input_shape, &input_data);
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
......@@ -60,8 +58,7 @@ void RandomTest(const int num_outputs, const int axis) {
for (int i = 0; i < num_outputs; ++i) {
builder = builder.Output(MakeString("OutputImage", i));
}
builder
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
builder.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
......@@ -77,8 +74,7 @@ void RandomTest(const int num_outputs, const int axis) {
if (D == DeviceType::GPU) {
for (int i = 0; i < num_outputs; ++i) {
ImageToBuffer<D, float>(&net,
MakeString("OutputImage", i),
ImageToBuffer<D, float>(&net, MakeString("OutputImage", i),
MakeString("Output", i),
kernels::BufferType::IN_OUT_CHANNEL);
}
......@@ -90,14 +86,12 @@ void RandomTest(const int num_outputs, const int axis) {
expected_shape = {batch, output_channels, height, width};
else if (axis == 3)
expected_shape = {batch, height, width, output_channels};
const index_t outer_size = std::accumulate(expected_shape.begin(),
expected_shape.begin() + axis,
1,
std::multiplies<index_t>());
const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1,
expected_shape.end(),
1,
std::multiplies<index_t>());
const index_t outer_size =
std::accumulate(expected_shape.begin(), expected_shape.begin() + axis, 1,
std::multiplies<index_t>());
const index_t inner_size =
std::accumulate(expected_shape.begin() + axis + 1, expected_shape.end(),
1, std::multiplies<index_t>());
const float *input_ptr = input_data.data();
const float *output_ptr;
for (int i = 0; i < num_outputs; ++i) {
......@@ -106,11 +100,11 @@ void RandomTest(const int num_outputs, const int axis) {
Tensor::MappingGuard output_mapper(output);
output_ptr = output->data<float>();
for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
const int idx = (outer_idx * input_channels + i * output_channels)
* inner_size;
const int idx =
(outer_idx * input_channels + i * output_channels) * inner_size;
for (int j = 0; j < output_channels * inner_size; ++j) {
ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output "
<< i << " index " << idx + j;
ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2)
<< "with output " << i << " index " << idx + j;
}
}
}
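
The verification loop treats the tensor as outer x channels x inner, with outer the product of dimensions before the slice axis and inner the product after it; output i then begins at channel offset i * output_channels inside each outer block, which is exactly the idx expression above. A standalone illustration with small hypothetical shapes:

#include <cstdio>
#include <vector>

// Slice a flattened {outer=2, channels=4, inner=3} tensor into two outputs
// of 2 channels each along the channel axis, mirroring the test's indexing.
int main() {
  const int outer = 2, in_ch = 4, out_ch = 2, inner = 3, num_out = 2;
  std::vector<int> input(outer * in_ch * inner);
  for (size_t i = 0; i < input.size(); ++i) input[i] = static_cast<int>(i);
  for (int i = 0; i < num_out; ++i) {
    std::printf("output %d:", i);
    for (int o = 0; o < outer; ++o) {
      const int idx = (o * in_ch + i * out_ch) * inner;
      for (int j = 0; j < out_ch * inner; ++j)
        std::printf(" %d", input[idx + j]);
    }
    std::printf("\n");
  }
  return 0;
}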
......
......@@ -31,7 +31,7 @@ class SoftmaxOp : public Operator<D, T> {
const Tensor *logits = this->Input(LOGITS);
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(logits);
MACE_RETURN_IF_ERROR(output->ResizeLike(logits));
return functor_(logits, output, future);
}
......
......@@ -22,7 +22,7 @@ namespace test {
class SoftmaxOpTest : public OpsTestBase {};
namespace {
template<DeviceType D>
template <DeviceType D>
void Simple() {
// Construct graph
OpsTestNet net;
......@@ -33,9 +33,9 @@ void Simple() {
if (D == DeviceType::CPU) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -45,9 +45,9 @@ void Simple() {
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -60,8 +60,8 @@ void Simple() {
}
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
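
The expected constants are softmax along the last axis: a uniform row gives 0.25 everywhere, and the second row matches softmax of {1, 2, 3, 4} (the input itself is truncated above, so that row is inferred). A quick numerical check:

#include <cmath>
#include <cstdio>

// Reproduce the second expected row of SoftmaxOpTest::Simple numerically.
int main() {
  const double x[4] = {1, 2, 3, 4};
  double sum = 0;
  for (double v : x) sum += std::exp(v);
  for (double v : x) std::printf("%.8f ", std::exp(v) / sum);
  std::printf("\n");  // 0.03205860 0.08714432 0.23688282 0.64391426
  return 0;
}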
......@@ -71,7 +71,7 @@ TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); }
namespace {
template<DeviceType D>
template <DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) {
// Construct graph
OpsTestNet net;
......@@ -81,9 +81,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
......@@ -97,9 +97,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on gpu
net.RunOp(D);
......@@ -108,8 +108,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
} // namespace
......
......@@ -29,16 +29,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(
OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
false) {}
functor_(OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
false) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *space_tensor = this->Input(INPUT);
Tensor *batch_tensor = this->Output(OUTPUT);
return functor_(const_cast<Tensor *>(space_tensor), batch_tensor,
future);
return functor_(const_cast<Tensor *>(space_tensor), batch_tensor, future);
}
private:
......
......@@ -35,22 +35,20 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
}
// Run
......@@ -60,10 +58,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -83,22 +79,20 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
}
// Run
......@@ -108,10 +102,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -124,8 +116,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const std::vector<int> &padding_data,
const std::vector<index_t> &batch_shape,
const std::vector<float> &batch_data) {
auto space_tensor = std::unique_ptr<Tensor>(new Tensor(
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
auto space_tensor = std::unique_ptr<Tensor>(
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
space_tensor->Resize(space_shape);
{
Tensor::MappingGuard space_mapper(space_tensor.get());
......@@ -136,8 +128,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
}
auto batch_tensor = std::unique_ptr<Tensor>(new Tensor(
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
auto batch_tensor = std::unique_ptr<Tensor>(
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
batch_tensor->Resize(batch_shape);
{
Tensor::MappingGuard batch_mapper(batch_tensor.get());
......@@ -147,12 +139,12 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
}
RunSpaceToBatch<DeviceType::GPU>(space_shape, space_data, block_data,
padding_data, batch_tensor.get());
padding_data, batch_tensor.get());
RunSpaceToBatch<DeviceType::CPU>(space_shape, space_data, block_data,
padding_data, batch_tensor.get());
RunBatchToSpace<DeviceType::GPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
padding_data, space_tensor.get());
RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
}
......@@ -209,45 +201,41 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
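
The 32 input values and the {8, 1, 2, 2} output are consistent with an NHWC input of shape {2, 2, 4, 2} and block {2, 2}: the output batch is N * block_h * block_w = 8, the spatial dims shrink to 1 x 2, and output batch index (bh * block_w + bw) * N + b gathers the pixels at block offset (bh, bw). The input shape is inferred, since it is truncated above; the shape arithmetic in a sketch:

#include <cstdio>

// Space-to-batch output-shape arithmetic for block {2, 2} and zero padding,
// assuming the NHWC input shape {2, 2, 4, 2} inferred from the data above.
int main() {
  const int n = 2, h = 2, w = 4, c = 2;
  const int bh = 2, bw = 2;
  std::printf("output: {%d, %d, %d, %d}\n", n * bh * bw, h / bh, w / bw, c);
  return 0;  // prints {8, 1, 2, 2}, matching the test
}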
void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) {
const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) {
OpsTestNet net;
net.AddRandomInput<GPU, float>("Input", input_shape);
// run gpu
BufferToImage<GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(GPU);
ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"OutputCPU",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
......@@ -255,8 +243,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
}
void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) {
const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) {
OpsTestNet net;
net.AddRandomInput<GPU, float>("Input", input_shape);
......@@ -264,38 +252,33 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
BufferToImage<GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(GPU);
ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
kernels::BufferType::IN_OUT_CHANNEL);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"OutputCPU",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
*net.GetOutput("OutputGPU"));
}
TEST(SpaceToBatchTest, LargeData) {
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
......
......@@ -24,20 +24,18 @@
namespace mace {
namespace ops {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class SpaceToDepthOp : public Operator<D, T> {
public:
SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) {
}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
const int block_size =
OperatorBase::GetOptionalArg<int>("block_size", 1);
const int block_size = OperatorBase::GetOptionalArg<int>("block_size", 1);
index_t input_height;
index_t input_width;
index_t input_depth;
......@@ -55,9 +53,9 @@ class SpaceToDepthOp : public Operator<D, T> {
MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4");
MACE_CHECK(
(input_width % block_size == 0) && (input_height % block_size == 0),
"input width and height should be dividable by block_size",
input->dim(3));
(input_width % block_size == 0) && (input_height % block_size == 0),
"input width and height should be dividable by block_size",
input->dim(3));
return functor_(input, output, future);
}
......
......@@ -19,9 +19,9 @@ namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::CPU, float>);
}
......
......@@ -18,31 +18,31 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/transpose.h"
#include "mace/kernels/softmax.h"
#include "mace/kernels/transpose.h"
namespace mace {
template<DeviceType D, class T>
template <DeviceType D, class T>
class TransposeOp : public Operator<D, T> {
public:
TransposeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgs<int>("dims")),
functor_(dims_) {}
: Operator<D, T>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgs<int>("dims")),
functor_(dims_) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
const std::vector<index_t> &input_shape = input->shape();
MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4)
|| (input_shape.size() == 2 && dims_.size() == 2),
MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) ||
(input_shape.size() == 2 && dims_.size() == 2),
"rank should be 2 or 4");
std::vector<index_t> output_shape;
for (size_t i = 0; i < dims_.size(); ++i) {
output_shape.push_back(input_shape[dims_[i]]);
}
MACE_FAILURE_RETURN(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
return functor_(input, output, future);
}
......
......@@ -29,18 +29,16 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
net.AddRandomInput<CPU, float>("Input", input_shape);
OpDefBuilder("Transpose", "TransposeNCHWTest")
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {0, 3, 1, 2})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {0, 3, 1, 2})
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("Input",
DataFormat::NHWC,
"InputNCHW",
DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
ExpectTensorNear<float>(*net.GetOutput("InputNCHW"),
*net.GetOutput("Output"));
......@@ -53,18 +51,16 @@ void TransposeNHWCTest(const std::vector<index_t> &input_shape) {
net.AddRandomInput<CPU, float>("Input", input_shape);
OpDefBuilder("Transpose", "TransposeNHWCTest")
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {0, 2, 3, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {0, 2, 3, 1})
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("Input",
DataFormat::NCHW,
"InputNHWC",
DataFormat::NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NCHW, "InputNHWC", DataFormat::NHWC);
ExpectTensorNear<float>(*net.GetOutput("InputNHWC"),
*net.GetOutput("Output"));
......@@ -91,16 +87,15 @@ TEST_F(TransposeOpTest, Rank2) {
net.AddInputFromArray<CPU, float>("Input", {2, 3}, {1, 2, 3, 4, 5, 6});
OpDefBuilder("Transpose", "TransposeNCHWTest")
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {1, 0})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {1, 0})
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.AddInputFromArray<CPU, float>("ExpectedOutput",
{3, 2},
net.AddInputFromArray<CPU, float>("ExpectedOutput", {3, 2},
{1, 4, 2, 5, 3, 6});
ExpectTensorNear<float>(*net.GetOutput("ExpectedOutput"),
......
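
The Rank2 expectation is the plain matrix transpose out[j][i] = in[i][j]; a one-line check that {1, 2, 3, 4, 5, 6} in 2x3 reads back as {1, 4, 2, 5, 3, 6}:

#include <cstdio>

// Transpose a 2x3 row-major matrix, reproducing {1, 4, 2, 5, 3, 6}.
int main() {
  const int in[2][3] = {{1, 2, 3}, {4, 5, 6}};
  for (int j = 0; j < 3; ++j)
    for (int i = 0; i < 2; ++i) std::printf("%d ", in[i][j]);
  std::printf("\n");
  return 0;
}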
......@@ -83,7 +83,7 @@ void WinogradConvolution(const index_t batch,
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape();
......@@ -132,34 +132,29 @@ void WinogradConvolution(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4);
}
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::SAME);
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME);
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME);
}
TEST_F(WinogradConvlutionTest, BatchConvolution) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME);
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME);
}
namespace {
......@@ -199,7 +194,7 @@ void WinogradConvolutionWithPad(const index_t batch,
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape();
......@@ -248,34 +243,26 @@ void WinogradConvolutionWithPad(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4);
}
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2);
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2);
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2);
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2);
}
} // namespace test
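
A note on why these tests use looser tolerances (1e-2 for half, 1e-4/1e-5 for float): Winograd convolution trades multiplications for extra transform arithmetic, which amplifies floating-point error. For F(m x m, 3 x 3) the per-tile multiply count is (m + 2)^2 versus 9 * m^2 for direct convolution; the tile size MACE uses is not visible in this diff. The arithmetic:

#include <cstdio>

// Multiplication counts per output tile for Winograd F(m x m, 3 x 3)
// versus direct convolution: (m + 2)^2 vs 9 * m^2.
int main() {
  for (int m : {2, 4}) {
    const int wino = (m + 2) * (m + 2), direct = 9 * m * m;
    std::printf("F(%dx%d,3x3): %d vs %d multiplies (%.2fx)\n",
                m, m, wino, direct, static_cast<double>(direct) / wino);
  }
  return 0;  // 16 vs 36 (2.25x) and 36 vs 144 (4.00x)
}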
......
......@@ -65,7 +65,7 @@ enum MaceStatus {
MACE_OUT_OF_RESOURCES = 2
};
#define MACE_FAILURE_RETURN(stmt) \
#define MACE_RETURN_IF_ERROR(stmt) \
{ \
MaceStatus status = (stmt); \
if (status != MACE_SUCCESS) { \
......
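
The macro body is truncated above; for readers following the rename, a plausible completion is sketched below. Only the shown prefix is from the diff; the tail (error handling and return) is an assumption.

// Hedged sketch of the renamed macro: the opening lines match the diff,
// the tail is assumed, not copied from MACE's source.
#define MACE_RETURN_IF_ERROR(stmt)                              \
  {                                                             \
    MaceStatus status = (stmt);                                 \
    if (status != MACE_SUCCESS) {                               \
      /* presumably logs and propagates the failing status */   \
      return status;                                            \
    }                                                           \
  }

Call sites then read as elsewhere in this diff: functions returning MaceStatus wrap fallible calls, e.g. MACE_RETURN_IF_ERROR(output->ResizeLike(input));, so the first failure propagates out of Run() instead of being dropped, which is exactly what the previously unchecked output->ResizeLike(logits); call in SoftmaxOp did.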