Commit b7a95857 authored by yejianwu

update relu buffer to image

Parent fd284f6a
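The kernels below switch the Relu input/output from a flat __global buffer to a 2-D OpenCL image. A minimal sketch (added for illustration, not part of this commit) of the layout the new kernels appear to assume, derived from their indexing (x = ch_blk * width + w, y = b * height + h), with four consecutive channels packed into each pixel:

#include <cstdint>

// Illustrative sketch only: maps an NHWC tensor element (b, h, w, c) to the
// 2-D image pixel that holds it. The image is assumed to be
// (channel_blocks * width) pixels wide and (batch * height) pixels tall,
// with channels c..c+3 packed into one 4-lane pixel.
struct ImageCoord { int64_t x; int64_t y; };

inline ImageCoord NhwcToImageCoord(int64_t b, int64_t h, int64_t w, int64_t c,
                                   int64_t height, int64_t width) {
  const int64_t ch_blk = c / 4;      // which block of 4 channels
  return {ch_blk * width + w,        // image x, matches "pos" in the kernel
          b * height + h};           // image y, matches "hb" in the kernel
}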
 #include <common.h>
 // Supported data type: half/float
-__kernel void relu(__global const DATA_TYPE *input,
-                   __private const int size,
-                   __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+__kernel void relu(__read_only image2d_t input,
+                   __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
 
-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = fmax(*(input+idx), 0);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
-    vstore4(data, idx, output);
-  }
+  const int pos = ch_blk * width + w;
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = fmax(in, (DATA_TYPE4)0);
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
 
-__kernel void relux(__global const DATA_TYPE *input,
+__kernel void relux(__read_only image2d_t input,
                     __private const DATA_TYPE max_limit,
-                    __private const int size,
-                    __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+                    __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
 
-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
-    vstore4(data, idx, output);
-  }
+  const int pos = ch_blk * width + w;
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = clamp(in, (DATA_TYPE4)0, (DATA_TYPE4)max_limit);
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
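DATA_TYPE4, READ_IMAGET, WRITE_IMAGET and SAMPLER are not defined in this diff; they presumably come from the included <common.h>, driven by the DATA_TYPE and CMD_DATA_TYPE macros that the host passes as build options. A plausible sketch of those helpers (an assumption; the real header is not shown in this commit):

// Assumed shape of the helpers from <common.h>; illustrative only.
#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)

#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)

// DATA_TYPE and CMD_DATA_TYPE arrive via -DDATA_TYPE=... and -DCMD_DATA_TYPE=...
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)    // e.g. read_imagef / read_imageh
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)  // e.g. write_imagef / write_imageh

__constant sampler_t SAMPLER =
    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;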
@@ -6,58 +6,70 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template <>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+template <typename T>
+void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      Tensor *output) {
-  index_t element_size = input->NumElements();
-  index_t blocks = (element_size + 3) / 4;
+  const index_t batch = input->dim(0);
+  const index_t height = input->dim(1);
+  const index_t width = input->dim(2);
+  const index_t channels = input->dim(3);
 
-  const uint32_t gws = blocks;
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   if (max_limit_ < 0) {
     auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
 
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
 
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
-        cl::NDRange(lws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]),
        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
 
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
     relu_kernel.setArg(idx++, max_limit_);
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
 
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
-        cl::NDRange(lws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]),
        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   }
 }
 
+template
+struct ReluFunctor<DeviceType::OPENCL, float>;
+template
+struct ReluFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
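For reference, the 3-D global work size built above lines up with the work-item ids used by the new kernels as follows (an illustrative mapping derived from the two files, not text from the commit):

// gws[0] = RoundUpDiv4(channels)  ->  ch_blk = get_global_id(0)
// gws[1] = width                  ->  w      = get_global_id(1)
// gws[2] = batch * height         ->  hb     = get_global_id(2)
// Each work-item handles exactly one 4-channel pixel, so the old
// "idx + 4 > size" tail-handling branch is no longer needed.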
@@ -33,11 +33,15 @@ struct ReluFunctor {
 template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                       Tensor *output);
 
-template <>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                        Tensor *output);
+template <typename T>
+struct ReluFunctor<DeviceType::OPENCL, T> {
+  T max_limit_;
+  void operator()(const Tensor *input, Tensor *output);
+};
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_RELU_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_RELU_H_
@@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(Relu, ReluOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, half>);
 
 }  // namespace mace
@@ -16,7 +16,7 @@ class ReluOp : public Operator<D, T> {
   ReluOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {
     functor_.max_limit_ =
-        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+        OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
   }
   bool Run() override {
     const Tensor *input_tensor = this->inputs_[0];
@@ -9,17 +9,28 @@
 namespace mace {
 
 template <DeviceType D, typename T>
-static void ReluBenchmark(int iters, int size) {
+static void ReluBenchmark(
+    int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluBM")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddRandomInput<D, float>("Input", {size});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("InputImage")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) {
   net.Sync();
 }
 
-#define BM_RELU_MACRO(SIZE, TYPE, DEVICE)                       \
-  static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) {   \
-    const int64_t tot = static_cast<int64_t>(iters) * SIZE;     \
-    mace::testing::ItemsProcessed(tot);                         \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));         \
-    ReluBenchmark<DEVICE, TYPE>(iters, SIZE);                   \
-  }                                                             \
-  BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE)
-
-#define BM_RELU(SIZE, TYPE)       \
-  BM_RELU_MACRO(SIZE, TYPE, CPU); \
-  BM_RELU_MACRO(SIZE, TYPE, NEON);\
-  BM_RELU_MACRO(SIZE, TYPE, OPENCL);
-
-BM_RELU(1000, float);
-BM_RELU(100000, float);
-BM_RELU(10000000, float);
-}  // namespace mace
\ No newline at end of file
+#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                       \
+  static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) {   \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
+    mace::testing::ItemsProcessed(tot);                               \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
+    ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                   \
+  }                                                                   \
+  BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE)
+
+#define BM_RELU(N, C, H, W, TYPE)       \
+  BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
+  BM_RELU_MACRO(N, C, H, W, TYPE, NEON);\
+  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
+
+BM_RELU(1, 1, 512, 512, float);
+BM_RELU(1, 3, 128, 128, float);
+BM_RELU(1, 3, 512, 512, float);
+BM_RELU(1, 32, 112, 112, float);
+BM_RELU(1, 64, 256, 256, float);
+
+}  // namespace mace
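As a worked example (expanded by hand, not part of the diff), BM_RELU(1, 3, 128, 128, float) produces three benchmarks; the CPU instance is roughly:

static void BM_RELU_13128128_float_CPU(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 1 * 3 * 128 * 128;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));
  ReluBenchmark<CPU, float>(iters, 1, 3, 128, 128);
}
BENCHMARK(BM_RELU_13128128_float_CPU);

Note that the pasted name joins N, C, H and W with no separators, so different shapes must remain distinct after the digits are concatenated.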
@@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {};
 template <DeviceType D>
 void TestSimple() {
   OpsTestNet net;
 
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
@@ -23,8 +19,28 @@ void TestSimple() {
                                   {-7, 7, -6, 6, -5, 5, -4, 4,
                                    -3, 3, -2, 2, -1, 1, 0, 0});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   auto expected = CreateTensor<float>({2, 2, 2, 2},
                                       {0, 7, 0, 6, 0, 5, 0, 4,
@@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) {
 template <DeviceType D>
 void TestUnalignedSimple() {
   OpsTestNet net;
 
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 3, 2},
+                                  {1, 3, 2, 1},
                                   {-7, 7, -6, 6, -5, 5});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
-  auto expected = CreateTensor<float>({1, 1, 3, 2},
+  auto expected = CreateTensor<float>({1, 3, 2, 1},
                                       {0, 7, 0, 6, 0, 5});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
 template <DeviceType D>
 void TestSimpleReluX() {
   OpsTestNet net;
 
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
@@ -94,8 +121,30 @@ void TestSimpleReluX() {
                                   {-7, 7, -6, 6, -5, 5, -4, 4,
                                    -3, 3, -2, 2, -1, 1, 0, 0});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   auto expected = CreateTensor<float>({2, 2, 2, 2},
                                       {0, 6, 0, 6, 0, 5, 0, 4,
@@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) {
 template <DeviceType D>
 void TestUnalignedSimpleReluX() {
   OpsTestNet net;
 
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 1, 7},
+                                  {1, 1, 7, 1},
                                   {-7, 7, -6, 6, -5, 5, -4});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
-  auto expected = CreateTensor<float>({1, 1, 1, 7},
+  auto expected = CreateTensor<float>({1, 1, 7, 1},
                                       {0, 6, 0, 6, 0, 5, 0});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);