提交 d4b8a5e0 编写于 作者: 刘琦

Merge branch 'master' into 'master'

Finish add2 opencl kernel and update tf converter to support icnet.

See merge request !100
......@@ -33,8 +33,8 @@ class OpenCLRuntime {
private:
cl::Context context_;
cl::CommandQueue command_queue_;
cl::Device device_;
cl::CommandQueue command_queue_;
cl::Program program_;
std::once_flag build_flag_;
};
......
......@@ -10,22 +10,31 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
struct AddNFunctor {
void operator()(const vector<const T *> &inputs, T *output, index_t size) {
memset(output, 0, size * sizeof(T));
int n = inputs.size();
void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
T *output_ptr = output_tensor->mutable_data<T>();
memset(output_ptr, 0, size * sizeof(T));
int n = input_tensors.size();
for (int i = 0; i < n; ++i) {
Tensor::MappingGuard input_map(input_tensors[i]);
const T *input_ptr = input_tensors[i]->data<T>();
for (index_t j = 0; j < size; ++j) {
output[j] += inputs[i][j];
output_ptr[j] += input_ptr[j];
}
}
}
};
template <>
template<>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const vector<const float *> &inputs, float *output, index_t size);
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(
std::vector<const Tensor *> &inputs, Tensor *output);
} // namespace kernels
} // namespace mace
......
......@@ -10,10 +10,12 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const vector<const float *> &inputs, float *output, index_t size) {
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
// TODO: neon mem copy
memset(output, 0, size * sizeof(float));
int n = inputs.size();
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
memset(output_ptr, 0, size * sizeof(float));
int n = input_tensors.size();
int64_t cost = size * n;
int64_t groups = 1;
if (cost > kCostPerGroup) {
......@@ -27,8 +29,9 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
int nn = count >> 2;
int remain = count - (nn << 2);
for (int64_t j = 0; j < n; ++j) {
const float *inptr = inputs[j] + i;
float *outptr = output + i;
const float *input_base = input_tensors[j]->data<float>();
const float *inptr = input_base + i;
float *outptr = output_ptr + i;
for (int k = 0; k < nn; ++k) {
float32x4_t _inptr = vld1q_f32(inptr);
float32x4_t _outptr = vld1q_f32(outptr);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
namespace kernels {
static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
index_t element_size = input0->NumElements();
index_t blocks = (element_size + 3) / 4;
const uint32_t gws = blocks;
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto addn_kernel = cl::Kernel(program, "add2");
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0;
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
MACE_CHECK(error == CL_SUCCESS);
}
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
if (input_tensors.empty() || input_tensors.front() == nullptr) {
return;
}
size_t size = input_tensors.size();
switch (size) {
case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
break;
default:MACE_NOT_IMPLEMENTED;
}
};
} // namespace kernels
} // namespace mace
__kernel void add2(__global const float *input0,
__global const float *input1,
__private const int size,
__global float *output) {
int idx = get_global_id(0);
if (idx + 4 > size) {
for(; idx < size; ++idx) {
*(output+idx) = *(input0+idx) + *(input1+idx);
}
} else {
float4 in_data0 = vload4(idx, input0);
float4 in_data1 = vload4(idx, input1);
vstore4(in_data0+in_data1, idx, output);
}
}
......@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>);
} // namespace mace
......@@ -10,7 +10,7 @@
namespace mace {
template <DeviceType D, class T>
template<DeviceType D, class T>
class AddNOp : public Operator<D, T> {
public:
AddNOp(const OperatorDef &operator_def, Workspace *ws)
......@@ -19,16 +19,13 @@ class AddNOp : public Operator<D, T> {
bool Run() override {
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
T *output = output_tensor->mutable_data<T>();
index_t size = this->inputs_[0]->size();
int n = this->inputs_.size();
vector<const T *> inputs(n);
vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) {
const Tensor *input_tensor = this->inputs_[i];
inputs[i] = input_tensor->data<T>();
inputs[i] = this->inputs_[i];
}
functor_(inputs, output, size);
functor_(inputs, output_tensor);
return true;
}
......
......@@ -9,9 +9,44 @@ namespace mace {
class AddnOpTest : public OpsTestBase {};
TEST_F(AddnOpTest, AddnOp) {
template<DeviceType D>
void SimpleAdd2() {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd2) {
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd2) {
SimpleAdd2<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) {
SimpleAdd2<DeviceType::OPENCL>();
}
template<DeviceType D>
void SimpleAdd3() {
// Construct graph
OpsTestNet net;
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
......@@ -20,20 +55,62 @@ TEST_F(AddnOpTest, AddnOp) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input1", {1, 2, 3, 4});
net.AddRandomInput<DeviceType::CPU, float>("Input2", {1, 2, 3, 4});
net.AddRandomInput<DeviceType::CPU, float>("Input3", {1, 2, 3, 4});
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp();
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd3) {
SimpleAdd3<DeviceType::CPU>();
}
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
TEST_F(AddnOpTest, NEONSimpleAdd3) {
SimpleAdd3<DeviceType::NEON>();
}
template<DeviceType D>
void RandomTest() {
// Construct graph
OpsTestNet net;
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4});
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4});
// Check
net.RunOp(DeviceType::NEON);
net.RunOp(D);
Tensor result;
result.Copy(*net.GetOutput("Output"));
// Run
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5);
}
TEST_F(AddnOpTest, CPURandom) {
RandomTest<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONRandom) {
RandomTest<DeviceType::NEON>();
}
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
} // namespace mace
......@@ -12,6 +12,33 @@ pooling_type_mode = {
'MaxPool': 2
}
def convert_tensor(op, tensor):
tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
if (op.name.find('pointwise_kernel') != -1 or
op.name.find('depthwise_kernel') != -1 or
op.name.endswith('weights') or
op.name.endswith('kernel')) \
and op.outputs[0].consumers()[0].type.find('Conv') != -1:
if op.outputs[0].consumers()[0].get_attr('data_format') == 'NCHW':
tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
shape = [shape[3], shape[2], shape[0], shape[1]]
# print (tensor.name, shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(float).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
......@@ -24,26 +51,11 @@ def convert_ops(unresolved_ops, net_def):
first_op = unresolved_ops[0]
if first_op.type == 'Placeholder' or first_op.type == 'Reshape':
if first_op.type in ['Placeholder', 'Reshape', 'Identity']:
pass
elif first_op.type == 'Const':
tf_tensor = first_op.outputs[0].eval()
tensor = net_def.tensors.add()
tensor.name = first_op.outputs[0].name
# TODO: support other type than float
tensor.data_type = mace_pb2.DT_FLOAT
shape = list(tf_tensor.shape)
if (first_op.name.find('pointwise_kernel') != -1 or
first_op.name.find('depthwise_kernel') != -1 or
first_op.name.endswith('weights') or
first_op.name.endswith('kernel')) \
and first_op.outputs[0].consumers()[0].type.find('Conv') != -1:
tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
shape = [shape[3], shape[2], shape[0], shape[1]]
# print (tensor.name, shape)
tensor.dims.extend(shape)
tensor.float_data.extend(tf_tensor.astype(float).flat)
convert_tensor(first_op, tensor)
elif first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative':
op_def = net_def.op.add()
op_def.name = first_op.name
......@@ -61,9 +73,7 @@ def convert_ops(unresolved_ops, net_def):
strides_arg.ints.extend(first_op.get_attr('strides')[2:])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = first_op.get_attr('data_format')
if first_op.get_attr('data_format') != 'NCHW':
raise Exception('only support NCHW now')
data_format_arg.s = 'NCHW'
if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
bias_add_op = unresolved_ops[1]
......@@ -78,7 +88,8 @@ def convert_ops(unresolved_ops, net_def):
sub_op = unresolved_ops[5]
add_1_op = unresolved_ops[6]
# print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type)
if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \
mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
raise Exception('Invalid BatchNorm Op')
input_name = get_input_tensor(mul_1_op, 0).name
......@@ -104,12 +115,6 @@ def convert_ops(unresolved_ops, net_def):
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
elif first_op.type == 'Relu':
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs])
elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool':
op_def = net_def.op.add()
op_def.name = first_op.name
......@@ -130,9 +135,19 @@ def convert_ops(unresolved_ops, net_def):
kernels_arg.ints.extend(first_op.get_attr('ksize')[2:])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = first_op.get_attr('data_format')
if first_op.get_attr('data_format') != 'NCHW':
raise Exception('only support NCHW now')
data_format_arg.s = 'NCHW'
elif first_op.type == 'Add':
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = "AddN"
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs])
elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']:
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs])
else:
raise Exception('Unknown Op: ' + first_op.name)
pass
......@@ -152,4 +167,6 @@ def convert_to_mace_pb(input_graph_def):
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, net_def)
print "Done."
return net_def
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册