提交 b7a95857 编写于 作者: Y yejianwu

update relu buffer to image

上级 fd284f6a
#include <common.h> #include <common.h>
// Supported data type: half/float // Supported data type: half/float
__kernel void relu(__global const DATA_TYPE *input, __kernel void relu(__read_only image2d_t input,
__private const int size, __write_only image2d_t output) {
__global DATA_TYPE *output) { const int ch_blk = get_global_id(0);
int idx = get_global_id(0); const int w = get_global_id(1);
const int hb = get_global_id(2);
const int width = get_global_size(1);
if (idx + 4 > size) { const int pos = ch_blk * width + w;
for(; idx < size; ++idx) { DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
*(output+idx) = fmax(*(input+idx), 0); DATA_TYPE4 out = fmax(in, (DATA_TYPE4)0);
} WRITE_IMAGET(output, (int2)(pos, hb), out);
} else {
VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
vstore4(data, idx, output);
}
} }
__kernel void relux(__global const DATA_TYPE *input, __kernel void relux(__read_only image2d_t input,
__private const DATA_TYPE max_limit, __private const DATA_TYPE max_limit,
__private const int size, __write_only image2d_t output) {
__global DATA_TYPE *output) { const int ch_blk = get_global_id(0);
int idx = get_global_id(0); const int w = get_global_id(1);
const int hb = get_global_id(2);
const int width = get_global_size(1);
if (idx + 4 > size) { const int pos = ch_blk * width + w;
for(; idx < size; ++idx) { DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
*(output+idx) = clamp(*(input+idx), 0.0f, max_limit); DATA_TYPE4 out = clamp(in, (DATA_TYPE4)0, (DATA_TYPE4)max_limit);
} WRITE_IMAGET(output, (int2)(pos, hb), out);
} else {
VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
vstore4(data, idx, output);
}
} }
...@@ -6,58 +6,70 @@ ...@@ -6,58 +6,70 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) { Tensor *output) {
index_t element_size = input->NumElements(); const index_t batch = input->dim(0);
index_t blocks = (element_size + 3) / 4; const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const uint32_t gws = blocks; const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (max_limit_ < 0) { if (max_limit_ < 0) {
auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options); auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
const uint32_t lws[3] = {1, kwg_size, 1};
uint32_t idx = 0; uint32_t idx = 0;
relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange, relu_kernel, cl::NullRange,
cl::NDRange(gws), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
} else { } else {
auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options); auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); const uint32_t lws[3] = {1, kwg_size, 1};
uint32_t idx = 0; uint32_t idx = 0;
relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
relu_kernel.setArg(idx++, max_limit_); relu_kernel.setArg(idx++, max_limit_);
relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange, relu_kernel, cl::NullRange,
cl::NDRange(gws), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
} }
} }
template
struct ReluFunctor<DeviceType::OPENCL, float>;
template
struct ReluFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -33,11 +33,15 @@ struct ReluFunctor { ...@@ -33,11 +33,15 @@ struct ReluFunctor {
template <> template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output); Tensor *output);
template <>
void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, template <typename T>
Tensor *output); struct ReluFunctor<DeviceType::OPENCL, T> {
T max_limit_;
void operator()(const Tensor *input, Tensor *output);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_RELU_H_ #endif // MACE_KERNELS_RELU_H_
\ No newline at end of file
...@@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>); ...@@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Relu, ReluOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<half>("T")
.Build(),
ReluOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -16,7 +16,7 @@ class ReluOp : public Operator<D, T> { ...@@ -16,7 +16,7 @@ class ReluOp : public Operator<D, T> {
ReluOp(const OperatorDef &operator_def, Workspace *ws) ReluOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) { : Operator<D, T>(operator_def, ws) {
functor_.max_limit_ = functor_.max_limit_ =
OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1)); OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
} }
bool Run() override { bool Run() override {
const Tensor *input_tensor = this->inputs_[0]; const Tensor *input_tensor = this->inputs_[0];
......
...@@ -9,17 +9,28 @@ ...@@ -9,17 +9,28 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void ReluBenchmark(int iters, int size) { static void ReluBenchmark(
int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Relu", "ReluBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {size}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Relu", "ReluBM")
.Input("InputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Relu", "ReluBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) { ...@@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) {
net.Sync(); net.Sync();
} }
#define BM_RELU_MACRO(SIZE, TYPE, DEVICE) \ #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) { \ static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * SIZE; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, SIZE); \ ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE) BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE)
#define BM_RELU(SIZE, TYPE) \ #define BM_RELU(N, C, H, W, TYPE) \
BM_RELU_MACRO(SIZE, TYPE, CPU); \ BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
BM_RELU_MACRO(SIZE, TYPE, NEON);\ BM_RELU_MACRO(N, C, H, W, TYPE, NEON);\
BM_RELU_MACRO(SIZE, TYPE, OPENCL); BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
BM_RELU(1000, float); BM_RELU(1, 1, 512, 512, float);
BM_RELU(100000, float); BM_RELU(1, 3, 128, 128, float);
BM_RELU(10000000, float); BM_RELU(1, 3, 512, 512, float);
} // namespace mace BM_RELU(1, 32, 112, 112, float);
\ No newline at end of file BM_RELU(1, 64, 256, 256, float);
} // namespace mace
...@@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {}; ...@@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D>
void TestSimple() { void TestSimple() {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", net.AddInputFromArray<D, float>("Input",
...@@ -23,8 +19,28 @@ void TestSimple() { ...@@ -23,8 +19,28 @@ void TestSimple() {
{-7, 7, -6, 6, -5, 5, -4, 4, {-7, 7, -6, 6, -5, 5, -4, 4,
-3, 3, -2, 2, -1, 1, 0, 0}); -3, 3, -2, 2, -1, 1, 0, 0});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Relu", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({2, 2, 2, 2}, auto expected = CreateTensor<float>({2, 2, 2, 2},
{0, 7, 0, 6, 0, 5, 0, 4, {0, 7, 0, 6, 0, 5, 0, 4,
...@@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) { ...@@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) {
template <DeviceType D> template <DeviceType D>
void TestUnalignedSimple() { void TestUnalignedSimple() {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", net.AddInputFromArray<D, float>("Input",
{1, 1, 3, 2}, {1, 3, 2, 1},
{-7, 7, -6, 6, -5, 5}); {-7, 7, -6, 6, -5, 5});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Relu", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 3, 2}, auto expected = CreateTensor<float>({1, 3, 2, 1},
{0, 7, 0, 6, 0, 5}); {0, 7, 0, 6, 0, 5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) { ...@@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
template <DeviceType D> template <DeviceType D>
void TestSimpleReluX() { void TestSimpleReluX() {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", net.AddInputFromArray<D, float>("Input",
...@@ -94,8 +121,30 @@ void TestSimpleReluX() { ...@@ -94,8 +121,30 @@ void TestSimpleReluX() {
{-7, 7, -6, 6, -5, 5, -4, 4, {-7, 7, -6, 6, -5, 5, -4, 4,
-3, 3, -2, 2, -1, 1, 0, 0}); -3, 3, -2, 2, -1, 1, 0, 0});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Relu", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({2, 2, 2, 2}, auto expected = CreateTensor<float>({2, 2, 2, 2},
{0, 6, 0, 6, 0, 5, 0, 4, {0, 6, 0, 6, 0, 5, 0, 4,
...@@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) { ...@@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) {
template <DeviceType D> template <DeviceType D>
void TestUnalignedSimpleReluX() { void TestUnalignedSimpleReluX() {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", net.AddInputFromArray<D, float>("Input",
{1, 1, 1, 7}, {1, 1, 7, 1},
{-7, 7, -6, 6, -5, 5, -4}); {-7, 7, -6, 6, -5, 5, -4});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Relu", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 1, 7}, auto expected = CreateTensor<float>({1, 1, 7, 1},
{0, 6, 0, 6, 0, 5, 0}); {0, 6, 0, 6, 0, 5, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册