Commit 52cc0540 authored by liuqi

Finish conv 3x3 with stride 1 and 2.

Parent 0461beb5
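The new conv_2d_3x3 kernel below computes up to 4 output channels and 4 horizontally adjacent output pixels per work item, with separate stride-1 and stride-2 row helpers. As a rough host-side sketch of the work decomposition, assuming NCHW tensors (illustrative only; the names here are hypothetical and not part of the commit):

#include <cstdint>

// One work item per (batch, 4-channel block, 4-pixel block along the width).
struct GlobalWorkSize3x3 {
  uint32_t batch;
  uint32_t channel_blocks;  // (out_channels + 3) / 4
  uint32_t pixel_blocks;    // (out_width + 3) / 4 * out_height
};

inline GlobalWorkSize3x3 ComputeGlobalWorkSize3x3(uint32_t batch,
                                                  uint32_t out_channels,
                                                  uint32_t out_height,
                                                  uint32_t out_width) {
  return {batch, (out_channels + 3) / 4, (out_width + 3) / 4 * out_height};
}

This mirrors channel_blocks and pixel_blocks in the host-side launch code further down (InnerConv2dK3x3S12); inside the kernel, get_global_id(1) selects the channel block and get_global_id(2) selects the pixel block.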
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr,
                  const float *filter_ptr);
float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width);

void kernel conv_2d_3x3(global const float *input,
                        global const float *filter,
                        global const float *bias,
                        global float *output,
                        private const uint in_chan_num,
                        private const uint out_chan_num,
                        private const uint in_height,
                        private const uint in_width,
                        private const uint out_height,
                        private const uint out_width,
                        private const uint stride_h,
                        private const uint stride_w) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const uint in_pixel = in_height * in_width;
  const uint out_pixel = out_height * out_width;

  // Each work item covers up to 4 output channels and 4 adjacent output pixels.
  const uint round_out_width = (out_width + 3) / 4;
  const uint out_pixel_height = out_pixel_blk / round_out_width;
  const uint out_pixel_width = out_pixel_blk % round_out_width;

  const uint out_chan_begin = out_chan_blk * 4;
  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;

  const uint in_offset = batch * in_chan_num * in_pixel;
  const uint out_offset = batch * out_chan_num * out_pixel;
  const float *input_base = input + in_offset + in_pixel_begin;
  float *output_base = output + out_offset + out_pixel_begin;

  uint pixels = out_pixel_end - out_pixel_begin;

  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
    float4 res = (float4)bias[i];
    float *output_ptr = output_base + i * out_pixel;
    const float *filter_base = filter + i * in_chan_num * 9;
    if (pixels == 4) {
      // Fast path: 4 full output pixels, accumulated with vectorized 1x3 rows.
      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
        const float *input_ptr = input_base + in_chan_idx * in_pixel;
        const float *filter_ptr = filter_base + in_chan_idx * 9;
        if (stride_w == 1) {
          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        } else {
          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        }
      }
      vstore4(res, 0, output_ptr);
    } else {
      // Tail path: fewer than 4 pixels remain at the end of the row; compute them one by one.
      for (uint p = 0; p < pixels; ++p) {
        float res = bias[i];
        for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
          const float *input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
          const float *filter_ptr = filter_base + in_chan_idx * 9;
          res += conv3x3(input_ptr, filter_ptr, in_width);
        }
        output_ptr[p] = res;
      }
    }
  }
}
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr) {
  // Stride 1: four overlapping 1x3 windows starting at columns 0..3 of the row.
  float4 row0 = vload4(0, input_ptr);
  float2 input1 = vload2(0, input_ptr + 4);
  float4 row1 = (float4)(row0.s123, input1.s0);
  float4 row2 = (float4)(row0.s23, input1.s01);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}

float4 conv1x3_s2(const float *input_ptr,
                  const float *filter_ptr) {
  // Stride 2: even lanes are the window starts, odd lanes the next column.
  float8 input = vload8(0, input_ptr);
  float4 row0 = input.even;
  float4 row1 = input.odd;
  float4 row2 = (float4)(row0.s123, input_ptr[8]);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}

float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width) {
  // Scalar 3x3 accumulation used for the unaligned tail pixels.
  float3 input_value = vload3(0, input_ptr);
  float3 filter_value = vload3(0, filter_ptr);
  float3 res = input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(1, filter_ptr);
  res += input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(2, filter_ptr);
  res += input_value * filter_value;

  return res.s0 + res.s1 + res.s2;
}
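For readers checking the lane math in conv1x3_s1 and conv1x3_s2 above, here is a plain scalar reference written as a hedged C++ sketch (the function name is hypothetical and meant only for illustration or testing):

// out[i] = input[i * stride + 0] * filter[0] + input[i * stride + 1] * filter[1]
//        + input[i * stride + 2] * filter[2], for i = 0..3.
void Conv1x3Reference(const float *input, const float *filter, int stride,
                      float out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = input[i * stride + 0] * filter[0] +
             input[i * stride + 1] * filter[1] +
             input[i * stride + 2] * filter[2];
  }
}

With stride == 1 this matches the overlapping windows of conv1x3_s1; with stride == 2 it matches the even/odd lane split of conv1x3_s2.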
inline float4 conv1x3(const float *input_ptr,
                      const float *filter_ptr) {
  float8 input = vload8(0, input_ptr);
  float4 row0 = convert_float4(input.s0123);
  float4 row1 = convert_float4(input.s1234);
  float4 row2 = convert_float4(input.s2345);
  return (float4)filter_ptr[0] * row0 + (float4)filter_ptr[1] * row1
      + (float4)filter_ptr[2] * row2;
}

inline float4 conv3x3x4(const float *input_ptr,
                        const float *filter_ptr,
                        const int row_width) {
  float4 res;
  res = conv1x3(input_ptr + 0 * row_width, filter_ptr + 0 * 3);
  res += conv1x3(input_ptr + 1 * row_width, filter_ptr + 1 * 3);
  res += conv1x3(input_ptr + 2 * row_width, filter_ptr + 2 * 3);
  return res;
}
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr);
float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width);

inline float conv3x3(const float *input_ptr,
                     const float *filter_ptr,
                     const int row_width) {
  float res = input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  input_ptr += row_width;
  filter_ptr += 3;
  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  input_ptr += row_width;
  filter_ptr += 3;
  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  return res;
}
void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */
......@@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
input_ptr += 1;
}
} else {
float4 res = conv3x3x4(input_ptr, filter_ptr, in_width);
res += (float4)bias_value;
float4 res = (float4)bias_value;
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
vstore4(res, 0, output_ptr);
}
}
......
......@@ -3,7 +3,6 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
......@@ -11,6 +10,11 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
......@@ -22,7 +26,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, nullptr},
{nullptr, nullptr},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
......@@ -40,11 +44,16 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input, filter, bias, output);
return;
}
MACE_CHECK(paddings_[0] == 0 && paddings_[1] == 0, "Padding not supported");
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, output);
  if (paddings_[0] > 0 || paddings_[1] > 0) {
    // The OpenCL kernels do not handle padding themselves, so pad the input on
    // the host before launching.
    Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
    Tensor::MappingGuard input_mapper(input);
    ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
                              &padded_input);
    conv2d_func(&padded_input, filter, bias, output);
  } else {
    conv2d_func(input, filter, bias, output);
  }
}
} // namespace kernels
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"

namespace mace {
namespace kernels {

static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const uint32_t stride,
                               Tensor *output) {
  const index_t channels = output->shape()[1];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];

  MACE_CHECK(input->dim(0) == output->dim(0));

  // One work item per 4 output channels and per 4 output pixels along the width.
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");

  uint32_t idx = 0;
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));  // in_chan_num
  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));       // out_chan_num
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));  // in_height
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));  // in_width
  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));         // out_height
  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));          // out_width
  bm_kernel.setArg(idx++, stride);                                // stride_h
  bm_kernel.setArg(idx++, stride);                                // stride_w

  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
  const uint32_t lws[3] = {static_cast<uint32_t>(1),
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      bm_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);
}

void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                        const Tensor *bias, Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 1, output);
}

void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                        const Tensor *bias, Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 2, output);
}

} // namespace kernels
} // namespace mace
......@@ -3,7 +3,6 @@
//
#include <algorithm>
#include <sstream>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
......@@ -14,7 +13,6 @@ namespace mace {
template <DeviceType D, typename T>
static void Conv2d(int iters,
int iters_to_sync,
int batch,
int channels,
int height,
......@@ -32,37 +30,32 @@ static void Conv2d(int iters,
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Finalize(net.operator_def());
// Add args
net.AddIntsArg("strides", {stride, stride});
net.AddIntArg("padding", padding);
net.AddIntsArg("dilations", {1, 1});
// Add input data
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, kernel_w});
{output_channels, channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
if (iters % iters_to_sync == 0) {
net.Sync();
}
}
net.Sync();
}
// A typical network has more than one layer; syncing only every few iterations
// approximates the amortized per-layer latency. The OpenCL runtime for
// Mali/Adreno is in-order.
constexpr int kItersToSync = 10;
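As a generic illustration of the amortized-timing pattern described in the comment above (the names below are hypothetical and not the MACE benchmark API):

#include <chrono>

// Enqueue `iters` runs but only block on the in-order queue every
// `iters_to_sync` iterations, so the measured time amortizes per-enqueue
// overhead the way a multi-layer network would.
template <typename RunFn, typename SyncFn>
double AmortizedLatencyMs(int iters, int iters_to_sync, RunFn run, SyncFn sync) {
  auto start = std::chrono::steady_clock::now();
  for (int i = 1; i <= iters; ++i) {
    run();                               // e.g. enqueue one layer's kernels
    if (i % iters_to_sync == 0) sync();  // e.g. finish the command queue
  }
  sync();  // drain whatever is still in flight
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count() / iters;
}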
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
......@@ -70,8 +63,8 @@ constexpr int kItersToSync = 10;
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, OC); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
......
......@@ -3,16 +3,15 @@
//
#include "mace/ops/conv_2d.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
class Conv2dOpTest : public OpsTestBase {};
TEST_F(Conv2dOpTest, Simple_VALID) {
// Construct graph
auto &net = test_net();
template <DeviceType D>
void TestSimple3x3VALID() {
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
// Add args
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Simple_SAME) {
// Construct graph
auto &net = test_net();
template <DeviceType D>
void TestSimple3x3SAME() {
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>(
......@@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Combined) {
TEST_F(Conv2dOpTest, CPUSimple) {
TestSimple3x3VALID<DeviceType::CPU>();
TestSimple3x3SAME<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLSimple) {
TestSimple3x3VALID<DeviceType::OPENCL>();
TestSimple3x3SAME<DeviceType::OPENCL>();
}
template <DeviceType D>
static void TestCombined3x3() {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
......@@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 5, 5}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {2, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {2}, {0.1f, 0.2f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>(
......@@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) {
4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUCombined) {
TestCombined3x3<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLCombined) {
TestCombined3x3<DeviceType::OPENCL>();
}
template <DeviceType D>
......@@ -159,13 +188,16 @@ void TestConv1x1() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Conv1x1) {
TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
// TODO we need more tests
TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
template <DeviceType D>
static void TestAlignedConvNxNS12() {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
index_t width = 32;
index_t output_channels = 128;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<DeviceType::CPU, float>(
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
......@@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
}
}
TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
TEST_F(Conv2dOpTest, NEONAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::OPENCL>();
}
template <DeviceType D>
static void TestUnalignedConvNxNS12() {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
index_t width = 113;
index_t output_channels = 3 + rand() % 10;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<DeviceType::CPU, float>(
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
......@@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
}
}
}
TEST_F(Conv2dOpTest, NEONUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::OPENCL>();
}
......@@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
......@@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters,
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
......