Commit a9832dfb authored by yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-learning/mace into bm_to_image

@@ -10,15 +10,23 @@
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct AddNFunctor {
void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
struct AddNFunctorBase {};
template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
T *output_ptr = output_tensor->mutable_data<T>();
memset(output_ptr, 0, size * sizeof(T));
int n = input_tensors.size();
for (int i = 0; i < n; ++i) {
MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
Tensor::MappingGuard input_map(input_tensors[i]);
const T *input_ptr = input_tensors[i]->data<T>();
for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
}
};
template<>
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(
std::vector<const Tensor *> &inputs, Tensor *output);
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
#endif // MACE_KERNELS_ADDN_H_
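Note on the header change above: operator() now takes the input vector by const reference, and the OPENCL path becomes a partial specialization of the template (sharing AddNFunctorBase), so float and half reuse one definition. A standalone sketch of that dispatch pattern, with illustrative names (Device, Functor) rather than MACE's:

// Sketch only: a shared base plus a per-device partial specialization.
#include <cstdio>
#include <vector>

enum class Device { CPU, OPENCL };

struct FunctorBase {};  // shared configuration could live here

template <Device D, typename T>
struct Functor : FunctorBase {  // generic (CPU) path
  void operator()(const std::vector<T> &inputs, T *output) {
    T acc = T(0);
    for (const T &v : inputs) acc += v;
    *output = acc;
  }
};

template <typename T>
struct Functor<Device::OPENCL, T> : FunctorBase {  // device override
  void operator()(const std::vector<T> &inputs, T *output);  // defined in a .cc
};

int main() {
  std::vector<float> in{1.f, 2.f, 3.f};
  float out = 0.f;
  Functor<Device::CPU, float>()(in, &out);
  std::printf("%f\n", out);  // 6.0
}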
@@ -10,7 +10,7 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
// TODO: neon mem copy
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
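The "// TODO: neon mem copy" above implies the NEON specialization still accumulates with a plain loop. A hedged sketch of the vectorized inner loop it could use (NeonAccumulate is an illustrative helper, not a MACE function; assumes contiguous float buffers, scalar tail):

#include <arm_neon.h>

void NeonAccumulate(const float *input, float *output, long size) {
  long i = 0;
  for (; i + 4 <= size; i += 4) {
    float32x4_t acc = vld1q_f32(output + i);    // running sum
    float32x4_t in = vld1q_f32(input + i);
    vst1q_f32(output + i, vaddq_f32(acc, in));  // output += input
  }
  for (; i < size; ++i) output[i] += input[i];  // scalar tail
}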
@@ -5,52 +5,83 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
index_t element_size = input0->NumElements();
index_t blocks = (element_size + 3) / 4;
template <typename T>
static void AddN(const std::vector<const Tensor *> &input_tensors,
Tensor *output) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const uint32_t gws = blocks;
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(output->dtype()));
auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0;
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
for (auto input : input_tensors) {
addn_kernel.setArg(idx++,
*(static_cast<const cl::Image2D *>(input->buffer())));
}
addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
cl::NDRange(width_pixels, batch_height_pixels),
cl::NDRange(64, 16), // TODO fix this
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
}
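The global work size above follows from the image layout: each texel packs 4 consecutive channels, so the image is RoundUpDiv4(C) * W texels wide and N * H tall. A standalone check of that arithmetic (RoundUpDiv4 reimplemented here just for illustration):

#include <cstdint>
#include <cstdio>

static int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

int main() {
  // Sample NHWC dims matching the benchmark shapes below.
  int64_t n = 1, h = 240, w = 240, c = 256;
  int64_t channel_blocks = RoundUpDiv4(c);    // 64 texels along C
  int64_t width_pixels = channel_blocks * w;  // image width  = 15360
  int64_t batch_height_pixels = n * h;        // image height = 240
  std::printf("gws = %lld x %lld\n", (long long)width_pixels,
              (long long)batch_height_pixels);
}

The hard-coded cl::NDRange(64, 16) local size (flagged "// TODO fix this") happens to divide 15360 x 240 for this shape, but is not safe for arbitrary shapes, hence the TODO.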
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
if (input_tensors.empty() || input_tensors.front() == nullptr) {
return;
}
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
switch (size) {
case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
break;
default:MACE_NOT_IMPLEMENTED;
for (int i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
};
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
} // namespace mace
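The trailing "template struct AddNFunctor<DeviceType::OPENCL, float>;" and "... half>;" lines are explicit instantiations: the template's operator() body stays in this .cc while addn.h only declares it. A minimal illustration of the pattern:

// f.h
template <typename T> struct F { void Run(); };
// f.cc
template <typename T> void F<T>::Run() {}
template struct F<float>;  // emits F<float>::Run() into f.cc's object file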
#include <common.h>
// Supported data type: half/float
__kernel void add2(__global const DATA_TYPE *input0,
__global const DATA_TYPE *input1,
__private const int size,
__global DATA_TYPE *output) {
int idx = get_global_id(0);
__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#if INPUT_NUM > 2
__read_only image2d_t input2,
#endif
#if INPUT_NUM > 3
__read_only image2d_t input3,
#endif
__write_only image2d_t output) {
const int w = get_global_id(0);
const int hb = get_global_id(1);
if (idx + 4 > size) {
for(; idx < size; ++idx) {
*(output+idx) = *(input0+idx) + *(input1+idx);
}
} else {
VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
vstore4(in_data0+in_data1, idx, output);
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
DATA_TYPE4 out = in0 + in1;
#if INPUT_NUM > 2
DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
out = out + in2;
#endif
#if INPUT_NUM > 3
DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
out = out + in3;
#endif
WRITE_IMAGET(output, (int2)(w, hb), out);
}
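Each work item of the kernel above handles one texel (4 packed channels) at (w, hb), and the #if INPUT_NUM chain caps the kernel at 4 input images, matching the MACE_NOT_IMPLEMENTED guard on the host side. A CPU reference of what the kernel computes, as a sketch (image reads become indexing into a row-major, zero-padded buffer; one loop iteration = one work item):

#include <vector>

void AddNReference(const std::vector<const float *> &inputs, float *output,
                   int width_pixels, int batch_height_pixels) {
  for (int hb = 0; hb < batch_height_pixels; ++hb) {
    for (int w = 0; w < width_pixels; ++w) {
      const int base = (hb * width_pixels + w) * 4;  // 4 channels per texel
      for (int k = 0; k < 4; ++k) {
        float acc = 0.f;
        for (const float *in : inputs) acc += in[base + k];
        output[base + k] = acc;
      }
    }
  }
}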
@@ -23,4 +23,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace
@@ -10,7 +10,7 @@
namespace mace {
template<DeviceType D, class T>
template <DeviceType D, class T>
class AddNOp : public Operator<D, T> {
public:
AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override {
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) {
......
@@ -9,47 +9,69 @@
namespace mace {
template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) {
static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < n; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
// Add input data
for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
}
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data
for (int i = 0; i < n; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size});
if (D == DeviceType::OPENCL) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Sync();
}
}
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \
} \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float);
BM_ADDN(100, 1000, float);
BM_ADDN(100, 10000, float);
} // namespace mace
\ No newline at end of file
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(2, 1, 240, 240, 256, float);
// BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(4, 1, 240, 240, 256, float);
// BM_ADDN(4, 1, 240, 240, 256, half);
} // namespace mace
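One observation on the macro above: tot counts the elements of a single tensor, so BytesProcessed reports roughly the write traffic only. If total memory traffic is wanted, a hedged tweak (INPUTS reads plus 1 write) would be:

mace::testing::BytesProcessed(tot * (INPUTS + 1) * (sizeof(TYPE)));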
@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void SimpleAdd2() {
// Construct graph
OpsTestNet net;
@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12});
auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd2) {
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
TEST_F(AddnOpTest, NEONSimpleAdd2) {
SimpleAdd2<DeviceType::NEON>();
}
/*
TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
TEST_F(AddnOpTest, OPENCLSimpleAdd2) {
SimpleAdd2<DeviceType::OPENCL>();
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
*/
template<DeviceType D>
template <DeviceType D>
void SimpleAdd3() {
// Construct graph
OpsTestNet net;
@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18});
auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd3) {
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
TEST_F(AddnOpTest, NEONSimpleAdd3) {
SimpleAdd3<DeviceType::NEON>();
}
/*
TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
*/
template<DeviceType D>
template <DeviceType D>
void RandomTest() {
// Construct graph
OpsTestNet net;
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4});
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4});
// Check
net.RunOp(D);
Tensor result;
result.Copy(*net.GetOutput("Output"));
// Run
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5);
}
TEST_F(AddnOpTest, CPURandom) {
RandomTest<DeviceType::CPU>();
testing::internal::LogToStderr();
srand(time(NULL));
for (int round = 0; round < 10; ++round) {
// generate random input
index_t n = 1 + (rand() % 5);
index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
index_t c = 1 + (rand() % 32);
int input_num = 2 + rand() % 3;
// Construct graph
OpsTestNet net;
auto op_def = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
}
op_def.Output("Output").Finalize(net.NewOperatorDef());
// Add input data
for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
}
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
}
TEST_F(AddnOpTest, NEONRandom) {
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
} // namespace mace