Commit a9832dfb authored by yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-learning/mace into bm_to_image

@@ -10,15 +10,23 @@
 namespace mace {
 namespace kernels {
 
-template<DeviceType D, typename T>
-struct AddNFunctor {
-  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+struct AddNFunctorBase {};
+
+template <DeviceType D, typename T>
+struct AddNFunctor : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor) {
+    output_tensor->ResizeLike(input_tensors[0]);
     Tensor::MappingGuard output_map(output_tensor);
     index_t size = input_tensors[0]->size();
     T *output_ptr = output_tensor->mutable_data<T>();
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
+      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
+      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
+      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
+      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
   }
 };
 
-template<>
+template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
 
-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(
-    std::vector<const Tensor *> &inputs, Tensor *output);
+template <typename T>
+struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor);
+};
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_ADDN_H_
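The header change above sets up the dispatch pattern the rest of the commit relies on: the primary AddNFunctor<D, T> template carries a generic CPU loop, a full specialization only declares the NEON float variant, and a partial specialization over T declares the OpenCL variant whose body lives in the OpenCL source file and is explicitly instantiated there. Below is a minimal, self-contained sketch of that pattern using hypothetical stand-in names (Device, MyFunctor); these are illustrations, not MACE types.

// Minimal sketch of the device-dispatch pattern used by AddNFunctor.
#include <iostream>
#include <vector>

enum class Device { CPU, OPENCL };

// Primary template: generic (CPU) path defined inline, like AddNFunctor<D, T>.
template <Device D, typename T>
struct MyFunctor {
  void operator()(const std::vector<T> &inputs, T *output) {
    *output = T(0);
    for (const T &v : inputs) *output += v;
  }
};

// Partial specialization for one device: only declared in the header,
// mirroring AddNFunctor<DeviceType::OPENCL, T>.
template <typename T>
struct MyFunctor<Device::OPENCL, T> {
  void operator()(const std::vector<T> &inputs, T *output);
};

// Out-of-line definition (normally in a device-specific .cc) plus an explicit
// instantiation so the linker can find it, as the OpenCL .cc below does for
// float and half.
template <typename T>
void MyFunctor<Device::OPENCL, T>::operator()(const std::vector<T> &inputs,
                                              T *output) {
  *output = T(0);
  for (const T &v : inputs) *output += v;  // stands in for the kernel launch
}
template struct MyFunctor<Device::OPENCL, float>;

int main() {
  std::vector<float> in{1.f, 2.f, 3.f};
  float out = 0.f;
  MyFunctor<Device::CPU, float>()(in, &out);
  std::cout << out << "\n";  // prints 6
  MyFunctor<Device::OPENCL, float>()(in, &out);
  std::cout << out << "\n";  // prints 6
  return 0;
}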
@@ -10,7 +10,7 @@ namespace kernels {
 
 template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   // TODO: neon mem copy
   index_t size = output_tensor->size();
   float *output_ptr = output_tensor->mutable_data<float>();
@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
 };
 
 }  // namespace kernels
-}  // namespace mace
\ No newline at end of file
+}  // namespace mace
@@ -5,52 +5,83 @@
 #include "mace/kernels/addn.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
-  index_t element_size = input0->NumElements();
-  index_t blocks = (element_size + 3) / 4;
+template <typename T>
+static void AddN(const std::vector<const Tensor *> &input_tensors,
+                 Tensor *output) {
+  if (input_tensors.size() > 4) {
+    MACE_NOT_IMPLEMENTED;
+  }
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channels = output->dim(3);
 
-  const uint32_t gws = blocks;
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_pixels = channel_blocks * width;
+  const index_t batch_height_pixels = batch * height;
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(output->dtype()));
-  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
+  auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
 
   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
 
   uint32_t idx = 0;
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
-  addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
-  addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  for (auto input : input_tensors) {
+    addn_kernel.setArg(idx++,
+                       *(static_cast<const cl::Image2D *>(input->buffer())));
+  }
+  addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       addn_kernel, cl::NullRange,
-      cl::NDRange(gws),
-      cl::NDRange(lws),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+      cl::NDRange(width_pixels, batch_height_pixels),
+      cl::NDRange(64, 16),  // TODO fix this
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
+  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
 }
 
-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
-                                                        Tensor *output_tensor) {
-  if (input_tensors.empty() || input_tensors.front() == nullptr) {
-    return;
-  }
+template <typename T>
+void AddNFunctor<DeviceType::OPENCL, T>::operator()(
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   size_t size = input_tensors.size();
+  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
+
+  const index_t batch = input_tensors[0]->dim(0);
+  const index_t height = input_tensors[0]->dim(1);
+  const index_t width = input_tensors[0]->dim(2);
+  const index_t channels = input_tensors[0]->dim(3);
 
-  switch (size) {
-    case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
-      break;
-    default:MACE_NOT_IMPLEMENTED;
+  for (int i = 1; i < size; ++i) {
+    MACE_CHECK_NOTNULL(input_tensors[i]);
+    MACE_CHECK(batch == input_tensors[i]->dim(0));
+    MACE_CHECK(height == input_tensors[i]->dim(1));
+    MACE_CHECK(width == input_tensors[i]->dim(2));
+    MACE_CHECK(channels == input_tensors[i]->dim(3));
   }
+
+  std::vector<index_t> output_shape = input_tensors[0]->shape();
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+  output_tensor->ResizeImage(output_shape, output_image_shape);
+
+  AddN<T>(input_tensors, output_tensor);
 };
 
+template
+struct AddNFunctor<DeviceType::OPENCL, float>;
+
+template
+struct AddNFunctor<DeviceType::OPENCL, half>;
+
 }  // namespace kernels
 }  // namespace mace
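A note on the work-size math in the new AddN(): the tensors are stored as OpenCL images with four channels packed into each pixel, so the x dimension of the global work size spans RoundUpDiv4(channels) * width pixels and the y dimension spans batch * height rows. The following standalone sketch walks through that arithmetic, assuming RoundUpDiv4(x) is the usual (x + 3) / 4 (the same rounding the removed buffer kernel computed by hand); it is not copied from mace/utils/utils.h.

// Standalone sketch of the 2-D global work size used by the OpenCL AddN above.
#include <cstdint>
#include <cstdio>

using index_t = int64_t;

static index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }  // assumed definition

int main() {
  // Shape used by the benchmark below: N=1, H=240, W=240, C=256.
  const index_t batch = 1, height = 240, width = 240, channels = 256;

  const index_t channel_blocks = RoundUpDiv4(channels);  // 64 blocks of 4 channels
  const index_t width_pixels = channel_blocks * width;   // gws[0] = 15360
  const index_t batch_height_pixels = batch * height;    // gws[1] = 240

  // Each work-item reads one 4-channel pixel per input image at (w, hb) and
  // writes one pixel of the output image.
  std::printf("gws = (%lld, %lld), %lld pixels per image\n",
              static_cast<long long>(width_pixels),
              static_cast<long long>(batch_height_pixels),
              static_cast<long long>(width_pixels * batch_height_pixels));
  return 0;
}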
 #include <common.h>
-// Supported data type: half/float
-__kernel void add2(__global const DATA_TYPE *input0,
-                   __global const DATA_TYPE *input1,
-                   __private const int size,
-                   __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+                   __read_only image2d_t input1,
+#if INPUT_NUM > 2
+                   __read_only image2d_t input2,
+#endif
+#if INPUT_NUM > 3
+                   __read_only image2d_t input3,
+#endif
+                   __write_only image2d_t output) {
+  const int w = get_global_id(0);
+  const int hb = get_global_id(1);
 
-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = *(input0+idx) + *(input1+idx);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
-    vstore4(in_data0+in_data1, idx, output);
-  }
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
+  DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
+  DATA_TYPE4 out = in0 + in1;
+
+#if INPUT_NUM > 2
+  DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
+  out = out + in2;
+#endif
+
+#if INPUT_NUM > 3
+  DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
+  out = out + in3;
+#endif
+
+  WRITE_IMAGET(output, (int2)(w, hb), out);
 }
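The kernel's parameter list is fixed at compile time rather than at launch time: the host-side AddN() above passes -DINPUT_NUM=<count> along with the data-type macros as build options, and the #if INPUT_NUM guards compile in input2/input3 only when needed, which is why the host rejects more than four inputs. As a rough illustration, the options for a three-input float AddN might look like the sketch below; the concrete DATA_TYPE / CMD_DATA_TYPE values are assumptions inferred from the DtToUpstreamCLDt / DtToUpstreamCLCMDDt naming, not verified against helper.h.

// Illustration only: plausible build options for a three-input float AddN.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=float");  // assumed: DATA_TYPE4 -> float4
  built_options.emplace("-DCMD_DATA_TYPE=f");  // assumed: read_imagef / write_imagef
  built_options.emplace("-DINPUT_NUM=3");      // compiles in input2, drops input3

  // With INPUT_NUM=3 the effective kernel signature is:
  //   addn(image2d_t input0, image2d_t input1, image2d_t input2,
  //        image2d_t output)
  for (const std::string &opt : built_options) {
    std::cout << opt << ' ';
  }
  std::cout << '\n';
  return 0;
}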
@@ -23,4 +23,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
                              .Build(),
                          AddNOp<DeviceType::OPENCL, float>);
 
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         AddNOp<DeviceType::OPENCL, half>);
+
 }  // namespace mace
@@ -10,7 +10,7 @@
 namespace mace {
 
-template<DeviceType D, class T>
+template <DeviceType D, class T>
 class AddNOp : public Operator<D, T> {
  public:
   AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
   bool Run() override {
     Tensor *output_tensor = this->outputs_[0];
-    output_tensor->ResizeLike(this->inputs_[0]);
     int n = this->inputs_.size();
     vector<const Tensor *> inputs(n, nullptr);
     for (int i = 0; i < n; ++i) {
...
@@ -9,47 +9,69 @@
 namespace mace {
 
 template <DeviceType D, typename T>
-static void AddNBenchmark(int iters, int n, int size) {
+static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder op_def_builder("AddN", "AddNBM");
-  for (int i = 0; i < n; ++i) {
-    op_def_builder.Input(internal::MakeString("Input", i).c_str());
+  // Add input data
+  for (int i = 0; i < inputs; ++i) {
+    net.AddRandomInput<D, float>(
+        internal::MakeString("Input", i).c_str(), {n, h, w, c});
   }
-  op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
 
-  // Add input data
-  for (int i = 0; i < n; ++i) {
-    net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size});
+  if (D == DeviceType::OPENCL) {
+    for (int i = 0; i < inputs; ++i) {
+      BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
+                          internal::MakeString("InputImage", i).c_str(),
+                          kernels::BufferType::IN_OUT);
+    }
+    OpDefBuilder op_def_builder("AddN", "AddNBM");
+    for (int i = 0; i < inputs; ++i) {
+      op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
+    }
+    op_def_builder.Output("OutputImage")
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder op_def_builder("AddN", "AddNBM");
+    for (int i = 0; i < inputs; ++i) {
+      op_def_builder.Input(internal::MakeString("Input", i).c_str());
+    }
+    op_def_builder.Output("Output")
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
   }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
+    net.Sync();
   }
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
+    net.Sync();
   }
 }
 
-#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE)                           \
-  static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) {    \
-    const int64_t tot = static_cast<int64_t>(iters) * N * SIZE;        \
-    mace::testing::ItemsProcessed(tot);                                \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                \
-    AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE);                       \
-  }                                                                    \
-  BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE)
+#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                     \
+  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+      int iters) {                                                          \
+    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;        \
+    mace::testing::ItemsProcessed(tot);                                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
+    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                 \
+  }                                                                         \
+  BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
-#define BM_ADDN(N, SIZE, TYPE)       \
-  BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \
-  BM_ADDN_MACRO(N, SIZE, TYPE, NEON);
+#define BM_ADDN(INPUTS, N, H, W, C, TYPE)       \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
 
-BM_ADDN(10, 1000, float);
-BM_ADDN(10, 10000, float);
-BM_ADDN(100, 1000, float);
-BM_ADDN(100, 10000, float);
+BM_ADDN(2, 1, 240, 240, 256, float);
+// BM_ADDN(2, 1, 240, 240, 256, half);
+BM_ADDN(4, 1, 240, 240, 256, float);
+// BM_ADDN(4, 1, 240, 240, 256, half);
 
-}  // namespace mace
\ No newline at end of file
+}  // namespace mace
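One detail worth noting in the rewritten macro: the reported throughput is computed from a single tensor's element count (N * H * W * C per iteration), not multiplied by INPUTS, so ItemsProcessed/BytesProcessed track the output size rather than the total bytes read. For example, BM_ADDN(2, 1, 240, 240, 256, float) generates benchmarks named BM_ADDN_2_1_240_240_256_float_CPU and BM_ADDN_2_1_240_240_256_float_OPENCL, each reporting the figures computed in this small standalone check:

// Quick arithmetic check of what ItemsProcessed / BytesProcessed report for
// BM_ADDN(2, 1, 240, 240, 256, float). The macro uses N * H * W * C only.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 1, h = 240, w = 240, c = 256;
  const int64_t items_per_iter = n * h * w * c;                   // 14,745,600
  const int64_t bytes_per_iter = items_per_iter * sizeof(float);  // 58,982,400
  std::printf("%lld items, %.2f MiB per iteration\n",
              static_cast<long long>(items_per_iter),
              bytes_per_iter / (1024.0 * 1024.0));                // ~56.25 MiB
  return 0;
}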
@@ -9,7 +9,7 @@ namespace mace {
 
 class AddnOpTest : public OpsTestBase {};
 
-template<DeviceType D>
+template <DeviceType D>
 void SimpleAdd2() {
   // Construct graph
   OpsTestNet net;
@@ -20,30 +20,26 @@ void SimpleAdd2() {
       .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
-  net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
 
   // Run
   net.RunOp(D);
 
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12});
+  auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 
-TEST_F(AddnOpTest, CPUSimpleAdd2) {
-  SimpleAdd2<DeviceType::CPU>();
-}
+TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
 
-TEST_F(AddnOpTest, NEONSimpleAdd2) {
-  SimpleAdd2<DeviceType::NEON>();
-}
+/*
+TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
 
-TEST_F(AddnOpTest, OPENCLSimpleAdd2) {
-  SimpleAdd2<DeviceType::OPENCL>();
-}
+TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
+*/
 
-template<DeviceType D>
+template <DeviceType D>
 void SimpleAdd3() {
   // Construct graph
   OpsTestNet net;
@@ -55,62 +51,80 @@ void SimpleAdd3() {
       .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
-  net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
-  net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
 
   // Run
   net.RunOp(D);
 
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18});
+  auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 
-TEST_F(AddnOpTest, CPUSimpleAdd3) {
-  SimpleAdd3<DeviceType::CPU>();
-}
+TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
 
-TEST_F(AddnOpTest, NEONSimpleAdd3) {
-  SimpleAdd3<DeviceType::NEON>();
-}
+/*
+TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
+*/
 
-template<DeviceType D>
+template <DeviceType D>
 void RandomTest() {
-  // Construct graph
-  OpsTestNet net;
-  OpDefBuilder("AddN", "AddNTest")
-      .Input("Input1")
-      .Input("Input2")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
+  testing::internal::LogToStderr();
+  srand(time(NULL));
 
-  // Add input data
-  net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4});
-  net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4});
+  for (int round = 0; round < 10; ++round) {
+    // generate random input
+    index_t n = 1 + (rand() % 5);
+    index_t h = 1 + (rand() % 100);
+    index_t w = 1 + (rand() % 100);
+    index_t c = 1 + (rand() % 32);
+    int input_num = 2 + rand() % 3;
+    // Construct graph
+    OpsTestNet net;
+    auto op_def = OpDefBuilder("AddN", "AddNTest");
+    for (int i = 0; i < input_num; ++i) {
+      op_def.Input("Input" + ToString(i));
+    }
+    op_def.Output("Output").Finalize(net.NewOperatorDef());
 
-  // Check
-  net.RunOp(D);
+    // Add input data
+    for (int i = 0; i < input_num; ++i) {
+      net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
+    }
 
-  Tensor result;
-  result.Copy(*net.GetOutput("Output"));
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
 
-  // Run
-  net.RunOp();
+    // run on gpu
+    for (int i = 0; i < input_num; ++i) {
+      BufferToImage<D, half>(net, "Input" + ToString(i),
+                             "InputImage" + ToString(i),
+                             kernels::BufferType::IN_OUT);
+    }
 
-  ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5);
-}
+    auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
+    for (int i = 0; i < input_num; ++i) {
+      op_def_cl.Input("InputImage" + ToString(i));
+    }
+    op_def_cl.Output("OutputImage")
+        .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
+        .Finalize(net.NewOperatorDef());
 
-TEST_F(AddnOpTest, CPURandom) {
-  RandomTest<DeviceType::CPU>();
-}
+    // Run on device
+    net.RunOp(D);
 
-TEST_F(AddnOpTest, NEONRandom) {
-  RandomTest<DeviceType::NEON>();
-}
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
+                            kernels::BufferType::IN_OUT);
 
-TEST_F(AddnOpTest, OPENCLRandom) {
-  RandomTest<DeviceType::OPENCL>();
-}
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
+  }
+}
+
+TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
 
 }  // namespace mace
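The relaxed 0.1 tolerance in the random test (versus 1e-5 in the float-only tests) is consistent with the half-precision GPU path: fp16 carries a 10-bit mantissa, so its machine epsilon is about 9.8e-4, and after converting inputs to half and summing up to four of them, a 1e-5 bound would fail spuriously. A tiny sketch of the magnitudes involved (plain C++, no MACE types; the value scale of the random inputs is an assumption):

// Rough magnitude check: why a 1e-5 tolerance cannot hold for an fp16 path.
#include <cmath>
#include <cstdio>

int main() {
  const double fp16_epsilon = std::ldexp(1.0, -10);  // 2^-10 ~= 9.77e-4
  const double value_scale = 1.0;  // assumed typical magnitude of test inputs
  const int max_inputs = 4;        // the OpenCL kernel sums at most 4 tensors

  // Loose, purely illustrative bound if every operand and the running sum are
  // rounded to half precision.
  const double rough_error = max_inputs * value_scale * fp16_epsilon;
  std::printf("fp16 epsilon = %.3e, rough error bound = %.3e\n",
              fp16_epsilon, rough_error);  // ~9.8e-4 and ~3.9e-3, both >> 1e-5
  return 0;
}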