提交 6c8cc84e 编写于 作者: 李寅

Merge branch 'diff-input' into 'master'

Support different input shapes.

See merge request !397
...@@ -3,6 +3,7 @@ stages: ...@@ -3,6 +3,7 @@ stages:
- pycodestyle - pycodestyle
- platform_compitable_tests - platform_compitable_tests
- ops_test - ops_test
- api_test
- ops_benchmark - ops_benchmark
- extra_tests - extra_tests
...@@ -21,7 +22,13 @@ ops_test: ...@@ -21,7 +22,13 @@ ops_test:
stage: ops_test stage: ops_test
script: script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
api_test:
stage: api_test
script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
ops_benchmark: ops_benchmark:
stage: ops_benchmark stage: ops_benchmark
......
...@@ -178,6 +178,9 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -178,6 +178,9 @@ MaceStatus MaceEngine::Impl::Run(
std::vector<Tensor *> input_tensors; std::vector<Tensor *> input_tensors;
std::vector<Tensor *> output_tensors; std::vector<Tensor *> output_tensors;
for (auto &input : inputs) { for (auto &input : inputs) {
MACE_CHECK(input.second.shape().size() == 4,
"The Inputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions");
Tensor *input_tensor = Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first, ":0")); ws_->GetTensor(MakeString("mace_input_node_", input.first, ":0"));
input_tensor->Resize(input.second.shape()); input_tensor->Resize(input.second.shape());
...@@ -190,6 +193,11 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -190,6 +193,11 @@ MaceStatus MaceEngine::Impl::Run(
input_tensors.push_back(input_tensor); input_tensors.push_back(input_tensor);
} }
for (auto &output : *outputs) { for (auto &output : *outputs) {
if (device_type_ == DeviceType::OPENCL) {
MACE_CHECK(output.second.shape().size() == 4,
"The outputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions");
}
Tensor *output_tensor = Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0")); ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
output_tensors.push_back(output_tensor); output_tensors.push_back(output_tensor);
......
...@@ -81,15 +81,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -81,15 +81,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
} }
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU || type == DeviceType::NEON) { if (model_data_size > 0) {
tensor_buffer_ = std::unique_ptr<Buffer>( if (type == DeviceType::CPU || type == DeviceType::NEON) {
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)); tensor_buffer_ = std::unique_ptr<Buffer>(
} else { new Buffer(GetDeviceAllocator(type),
tensor_buffer_ = std::unique_ptr<Buffer>( model_data_ptr,
new Buffer(GetDeviceAllocator(type), model_data_size)); model_data_size));
tensor_buffer_->Map(nullptr); } else {
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size); tensor_buffer_ = std::unique_ptr<Buffer>(
tensor_buffer_->UnMap(); new Buffer(GetDeviceAllocator(type), model_data_size));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
}
} }
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
......
...@@ -163,6 +163,8 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -163,6 +163,8 @@ bool RunModel(const std::vector<std::string> &input_names,
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
// DO NOT USE tmp directory.
// please use APP's own directory
const std::string kernel_file_path = const std::string kernel_file_path =
"/data/local/tmp/mace_run/cl"; "/data/local/tmp/mace_run/cl";
......
...@@ -28,9 +28,12 @@ cc_library( ...@@ -28,9 +28,12 @@ cc_library(
"opencl/*.h", "opencl/*.h",
"arm/*.h", "arm/*.h",
]), ]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]) + if_android_armv7(["-mfpu=neon -mfloat-abi=softfp"]) + if_android([ copts = if_openmp_enabled(["-fopenmp"]) +
"-DMACE_ENABLE_OPENCL", if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
linkopts = if_android(["-lm"]), linkopts = if_android(["-lm"]),
deps = [ deps = [
"//mace/core", "//mace/core",
...@@ -48,9 +51,12 @@ cc_test( ...@@ -48,9 +51,12 @@ cc_test(
"opencl/*_test.cc", "opencl/*_test.cc",
], ],
), ),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]) + if_android_armv7(["-mfpu=neon -mfloat-abi=softfp"]) + if_android([ copts = if_openmp_enabled(["-fopenmp"]) +
"-DMACE_ENABLE_OPENCL", if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
......
...@@ -34,9 +34,12 @@ cc_library( ...@@ -34,9 +34,12 @@ cc_library(
["*.h"], ["*.h"],
exclude = ["ops_test_util.h"], exclude = ["ops_test_util.h"],
), ),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]) + if_android_armv7(["-mfpu=neon -mfloat-abi=softfp"]) + if_android([ copts = if_openmp_enabled(["-fopenmp"]) +
"-DMACE_ENABLE_OPENCL", if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
deps = [ deps = [
"//mace/kernels", "//mace/kernels",
], ],
...@@ -49,9 +52,12 @@ cc_test( ...@@ -49,9 +52,12 @@ cc_test(
srcs = glob( srcs = glob(
["*_test.cc"], ["*_test.cc"],
), ),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]) + if_android_armv7(["-mfpu=neon -mfloat-abi=softfp"]) + if_android([ copts = if_openmp_enabled(["-fopenmp"]) +
"-DMACE_ENABLE_OPENCL", if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
...@@ -65,9 +71,12 @@ cc_test( ...@@ -65,9 +71,12 @@ cc_test(
name = "ops_benchmark", name = "ops_benchmark",
testonly = 1, testonly = 1,
srcs = glob(["*_benchmark.cc"]), srcs = glob(["*_benchmark.cc"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]) + if_android_armv7(["-mfpu=neon -mfloat-abi=softfp"]) + if_android([ copts = if_openmp_enabled(["-fopenmp"]) +
"-DMACE_ENABLE_OPENCL", if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
linkopts = ["-fopenmp"], linkopts = ["-fopenmp"],
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
......
...@@ -375,90 +375,92 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { ...@@ -375,90 +375,92 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
namespace { namespace {
template<DeviceType D> template<DeviceType D>
void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) { void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape,
const int kernel, const int stride,
Padding type) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, // generate random input
Padding type) { srand(time(NULL));
// generate random input index_t batch = 3;
static unsigned int seed = time(NULL); index_t height = shape[0];
index_t batch = 3 + (rand_r(&seed) % 10); index_t width = shape[1];
index_t height = shape[0]; index_t input_channels = shape[2];
index_t width = shape[1]; index_t output_channels = shape[3];
index_t input_channels = shape[2] + (rand_r(&seed) % 10); // Construct graph
index_t output_channels = shape[3] + (rand_r(&seed) % 10); OpsTestNet net;
// Construct graph OpDefBuilder("FusedConv2D", "FusedConv2dTest")
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
std::vector<float> float_input_data; std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, GenerateRandomRealTypeData({batch, height, width, input_channels},
&float_input_data); &float_input_data);
std::vector<float> float_filter_data; std::vector<float> float_filter_data;
GenerateRandomRealTypeData( GenerateRandomRealTypeData(
{kernel_h, kernel_w, output_channels, input_channels}, {kernel, kernel, output_channels, input_channels},
&float_filter_data); &float_filter_data);
std::vector<float> float_bias_data; std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data); GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data); "Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels}, "Filter", {kernel, kernel, output_channels, input_channels},
float_filter_data); float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data); net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on cpu
BufferToImage<D, half>(&net, "Input", "InputImage", net.RunOp();
kernels::BufferType::IN_OUT_CHANNEL); // Check
BufferToImage<D, half>(&net, "Filter", "FilterImage", Tensor expected;
kernels::BufferType::CONV2D_FILTER); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); // run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
OpDefBuilder("FusedConv2D", "FusedConv2dTest") kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
.Input("BiasImage") .Input("BiasImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
};
for (int kernel_size : {1, 3}) { ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
for (int stride : {1, 2}) { 1e-2, 1e-1);
func(kernel_size, kernel_size, stride, stride, VALID);
}
}
} }
} // namespace } // namespace
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}); TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 1, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 1, SAME);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 2, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 2, SAME);
}
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 1, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 1, SAME);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 2, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 2, SAME);
} }
namespace { namespace {
......
...@@ -283,6 +283,16 @@ class OpsTestNet { ...@@ -283,6 +283,16 @@ class OpsTestNet {
return RunOp(DeviceType::CPU); return RunOp(DeviceType::CPU);
} }
bool RunNet(const NetDef &net_def, const DeviceType device) {
device_ = device;
net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT);
if (!net_->Run()) {
return false;
}
net_ = CreateNet(op_registry_, net_def, &ws_, device);
return net_->Run();
}
Tensor *GetOutput(const char *output_name) { Tensor *GetOutput(const char *output_name) {
return ws_.GetTensor(output_name); return ws_.GetTensor(output_name);
} }
......
...@@ -306,6 +306,13 @@ class CaffeConverter(object): ...@@ -306,6 +306,13 @@ class CaffeConverter(object):
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
input_op = self.ops_map[name]
if input_op.layer is not None:
output_shape = input_op.output_shape_map[input_op.layer.top[0]]
else:
output_shape = input_op.output_shape_map[input_op.name]
self.add_output_shape(op_def, output_shape)
def add_output_transform(self, names): def add_output_transform(self, names):
for name in names: for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
...@@ -1077,15 +1084,15 @@ class CaffeConverter(object): ...@@ -1077,15 +1084,15 @@ class CaffeConverter(object):
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
def convert(self, input_nodes, input_shapes, output_nodes): def convert(self, input_nodes, input_shapes, output_nodes):
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
if self.device == 'gpu': if self.device == 'gpu':
self.add_input_transform(input_nodes) self.add_input_transform(input_nodes)
if self.device == 'neon': if self.device == 'neon':
self.add_neon_input_transform(input_nodes) self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
for op in self.ops: for op in self.ops:
if op.name in self.resolved_ops: if op.name in self.resolved_ops:
continue continue
......
...@@ -32,7 +32,11 @@ class MemoryOptimizer(object): ...@@ -32,7 +32,11 @@ class MemoryOptimizer(object):
self.ref_counter[tensor_name] = 0 self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op): def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer' if op.type == 'BufferToImage':
for arg in op.arg:
if arg.name == 'mode' and arg.i == 0:
return True
return op.type == 'ImageToBuffer'
def get_mem_size(self, op_type, output_shape): def get_mem_size(self, op_type, output_shape):
mem_size = [0, 0] mem_size = [0, 0]
......
...@@ -141,6 +141,8 @@ class TFConverter(object): ...@@ -141,6 +141,8 @@ class TFConverter(object):
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
self.add_output_shape(self.ops[name].outputs, op_def)
def add_neon_input_transform(self, names): def add_neon_input_transform(self, names):
for name in names: for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
......
# Description:
#   Mace API end-to-end tests (mace_api_test.cc exercises MaceEngine).
#
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled", "if_android_armv7", "if_hexagon_enabled")

cc_test(
    name = "mace_api_test",
    testonly = 1,
    srcs = ["mace_api_test.cc"],
    # Feature defines/flags are appended one condition at a time so each
    # config (OpenMP, NEON, armv7 FPU/ABI, OpenCL, Hexagon) stays independent.
    copts = if_openmp_enabled(["-fopenmp"]) +
        if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
        if_android_armv7(["-mfpu=neon"]) +
        if_android_armv7(["-mfloat-abi=softfp"]) +
        if_android(["-DMACE_ENABLE_OPENCL"]) +
        if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
    linkopts = ["-fopenmp"],
    linkstatic = 1,
    deps = [
        "//mace/ops:test",
        "//mace/kernels:kernels",
        "//mace/ops:ops",
        "@gtest//:gtest_main",
    ],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace test {
// Fixture for end-to-end tests of the public MaceEngine API.
class MaceAPITest : public ::testing::Test {};
namespace {
void GenerateInputs(const std::vector<std::string> &input_names,
const std::vector<int64_t> &input_shape,
std::map<std::string, mace::MaceTensor> *inputs) {
size_t input_size = input_names.size();
for (size_t i = 0; i < input_size; ++i) {
// Allocate input and output
int64_t input_size =
std::accumulate(input_shape.begin(), input_shape.end(), 1,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
// load input
std::vector<float> input_data;
ops::test::GenerateRandomRealTypeData(input_shape, &input_data);
memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float));
(*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in);
}
}
void GenerateOutputs(const std::vector<std::string> &output_names,
const std::vector<int64_t> &output_shape,
std::map<std::string, mace::MaceTensor> *outputs) {
size_t output_size = output_names.size();
for (size_t i = 0; i < output_size; ++i) {
int64_t output_size =
std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
(*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out);
}
}
template <typename T>
void BufferToImage(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
const std::vector<int> &mem_ids,
NetDef *net_def,
const int mode = NetMode::NORMAL) {
OperatorDef operator_def;
ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("mode", mode)
.Finalize(&operator_def);
operator_def.set_mem_id(mem_ids);
net_def->add_op()->CopyFrom(operator_def);
}
template <typename T>
void ImageToBuffer(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
}
template <typename T>
void Conv3x3(const std::string &input_name,
const std::string &filter_name,
const std::string &output_name,
const std::vector<int> &mem_ids,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Conv2D", "Conv2dOp")
.Input(input_name)
.Input(filter_name)
.Output(output_name)
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(&operator_def);
operator_def.set_mem_id(mem_ids);
net_def->add_op()->CopyFrom(operator_def);
}
template <typename T>
void Relu(const std::string &input_name,
const std::string &output_name,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Activation", "ReluTest")
.Input(input_name)
.Output(output_name)
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
}
template <typename T>
void AddTensor(const std::string &name,
const std::vector<int64_t> &shape,
T *data,
NetDef *net_def) {
ConstTensor tensor(name,
reinterpret_cast<unsigned char *>(data),
shape,
DataTypeToEnum<T>::value);
net_def->mutable_tensors().push_back(tensor);
}
// Re-run the model through an OpsTestNet on device D and check that each
// MaceTensor in |outputs| (produced earlier by MaceEngine) matches the
// reference result under the corresponding "mace_output_node_<name>:0".
// Fixes: the output element count was accumulated with
// std::multiplies<float>() — element counts are integral and accumulating
// through float loses exactness for large tensors; also the range-for loops
// copied every map entry / tensor per iteration.
template <DeviceType D, typename T>
void CheckOutputs(const NetDef &net_def,
                  const std::map<std::string, mace::MaceTensor> &inputs,
                  const std::map<std::string, mace::MaceTensor> &outputs) {
  ops::test::OpsTestNet net;
  for (const auto &input : inputs) {
    auto input_shape = input.second.shape();
    const int64_t data_size = std::accumulate(input_shape.begin(),
                                              input_shape.end(), 1,
                                              std::multiplies<int64_t>());
    std::vector<float> input_data(data_size);
    memcpy(input_data.data(), input.second.data().get(),
           data_size * sizeof(float));
    std::string input_name = MakeString("mace_input_node_",
                                        input.first, ":0");
    net.AddInputFromArray<D, float>(input_name, input.second.shape(),
                                    input_data);
  }
  // Constant tensors (e.g. the conv filter) are fed as plain inputs so the
  // reference net sees the same weights the engine used.
  const auto &tensors = net_def.tensors();
  for (const auto &tensor : tensors) {
    auto shape = tensor.dims();
    const int64_t data_size = std::accumulate(shape.begin(),
                                              shape.end(), 1,
                                              std::multiplies<int64_t>());
    std::vector<T> data(data_size);
    memcpy(data.data(), reinterpret_cast<const T *>(tensor.data()),
           data_size * sizeof(T));
    net.AddInputFromArray<D, T>(tensor.name(), shape, data);
  }
  net.RunNet(net_def, D);
  for (const auto &output : outputs) {
    // Copy the engine's output into a CPU tensor for comparison.
    std::unique_ptr<Tensor> tmp_tensor(
        new Tensor(GetDeviceAllocator(DeviceType::CPU),
                   DataTypeToEnum<float>::v()));
    auto output_shape = output.second.shape();
    const int64_t data_size = std::accumulate(output_shape.begin(),
                                              output_shape.end(), 1,
                                              std::multiplies<int64_t>());
    tmp_tensor->Resize(output.second.shape());
    float *data = tmp_tensor->mutable_data<float>();
    memcpy(data, output.second.data().get(), data_size * sizeof(float));
    std::string output_name = MakeString("mace_output_node_",
                                         output.first, ":0");
    ops::test::ExpectTensorNear<float>(*tmp_tensor,
                                       *net.GetOutput(output_name.data()),
                                       1e-5);
  }
}
// Pre-allocate shared memory blocks in |*net_def| for every input and output
// name, sized to the largest requested shape, and return a name -> mem_id
// mapping. Block extents are dims[2] * ceil(dims[3] / 4) (x) and
// dims[0] * dims[1] (y) per shape.
std::map<std::string, int> AddMemoryOptimization(
    const std::vector<std::string> &input_names,
    const std::vector<std::string> &output_names,
    const std::vector<std::vector<int64_t>> &input_shapes,
    const std::vector<std::vector<int64_t>> &output_shapes,
    NetDef *net_def) {
  std::map<std::string, int> res;
  int mem_id = 0;
  // One block size covering the largest input shape.
  uint32_t in_mem_block_x = 0;
  uint32_t in_mem_block_y = 0;
  for (const auto &shape : input_shapes) {
    in_mem_block_x = std::max<uint32_t>(in_mem_block_x,
                                        shape[2] * RoundUpDiv4(shape[3]));
    in_mem_block_y = std::max<uint32_t>(in_mem_block_y,
                                        shape[0] * shape[1]);
  }
  for (const auto &input_name : input_names) {
    net_def->mutable_mem_arena().mutable_mem_block().push_back(
        MemoryBlock(mem_id, in_mem_block_x, in_mem_block_y));
    res[input_name] = mem_id;
    ++mem_id;
  }
  // Same sizing rule for the outputs.
  uint32_t out_mem_block_x = 0;
  uint32_t out_mem_block_y = 0;
  for (const auto &shape : output_shapes) {
    out_mem_block_x = std::max<uint32_t>(out_mem_block_x,
                                         shape[2] * RoundUpDiv4(shape[3]));
    out_mem_block_y = std::max<uint32_t>(out_mem_block_y,
                                         shape[0] * shape[1]);
  }
  for (const auto &output_name : output_names) {
    net_def->mutable_mem_arena().mutable_mem_block().push_back(
        MemoryBlock(mem_id, out_mem_block_x, out_mem_block_y));
    res[output_name] = mem_id;
    ++mem_id;
  }
  return res;
}
// Build a small net (BufferToImage -> Conv2D -> ImageToBuffer per
// input/output pair), run it several times through MaceEngine with each of
// the given shapes, then verify the last run against a reference execution.
// The height and width of input and output must be equal.
template <typename T>
void MaceRun(const int in_out_size,
             const std::vector<std::vector<int64_t>> &input_shapes,
             const std::vector<std::vector<int64_t>> &output_shapes,
             const std::vector<int64_t> &filter_shape) {
  std::vector<std::string> input_names;
  std::vector<std::string> output_names;
  for (int i = 0; i < in_out_size; ++i) {
    input_names.push_back(MakeString("input", i));
    output_names.push_back(MakeString("output", i));
  }
  std::string filter_tensor_name = "filter";
  std::string filter_tensor_img_name = filter_tensor_name + "_image";

  const DeviceType device = DeviceType::OPENCL;

  NetDef net_def;

  // Add memory optimization
  auto mem_map = AddMemoryOptimization(input_names, output_names,
                                       input_shapes, output_shapes,
                                       &net_def);

  // Register the random filter weights as a constant tensor.
  // NOTE(review): `data` must stay alive until the engine is constructed —
  // AddTensor stores a raw pointer into it.
  std::vector<T> data;
  ops::test::GenerateRandomRealTypeData<T>(filter_shape, &data);
  AddTensor<T>(filter_tensor_name, filter_shape, data.data(), &net_def);

  // Convert each engine-facing input buffer into an image tensor.
  for (size_t i = 0; i < input_names.size(); ++i) {
    std::string input_name = MakeString("mace_input_node_",
                                        input_names[i], ":0");
    BufferToImage<half>(input_name, input_names[i],
                        mace::kernels::IN_OUT_CHANNEL,
                        {mem_map[input_names[i]]},
                        &net_def);
  }
  // Filter conversion runs once at INIT time, not on every inference.
  BufferToImage<half>(filter_tensor_name, filter_tensor_img_name,
                      mace::kernels::CONV2D_FILTER, {},
                      &net_def, NetMode::INIT);
  for (size_t i = 0; i < output_names.size(); ++i) {
    Conv3x3<half>(input_names[i], filter_tensor_img_name,
                  output_names[i], {mem_map[output_names[i]]},
                  &net_def);
  }
  // Convert each conv output image back to an engine-facing output buffer.
  for (size_t i = 0; i < output_names.size(); ++i) {
    std::string output_name = MakeString("mace_output_node_",
                                         output_names[i], ":0");
    ImageToBuffer<float>(output_names[i], output_name,
                         mace::kernels::IN_OUT_CHANNEL, &net_def);
  }

  MaceEngine engine(&net_def, device, input_names, output_names);

  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;

  // Cycle through every shape five times to exercise re-running the engine
  // with different input shapes; only the final run's results are checked.
  for (int i = 0; i < 5; ++i) {
    size_t input_shape_size = input_shapes.size();
    for (size_t j = 0; j < input_shape_size; ++j) {
      inputs.clear();
      outputs.clear();
      GenerateInputs(input_names, input_shapes[j], &inputs);
      GenerateOutputs(output_names, output_shapes[j], &outputs);
      engine.Run(inputs, &outputs);
    }
  }

  CheckOutputs<DeviceType::OPENCL, T>(net_def, inputs, outputs);
}
} // namespace
// One input/output pair, exercised with float and half compute types.
TEST_F(MaceAPITest, GPUSingleInputOutput) {
  MaceRun<float>(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {3, 3, 16, 16});
  MaceRun<half>(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {3, 3, 16, 16});
}
// Two input/output pairs sharing a single shape, float and half.
TEST_F(MaceAPITest, GPUMultipleInputOutput) {
  MaceRun<float>(2,
                 {{1, 16, 32, 16}},
                 {{1, 16, 32, 16}},
                 {3, 3, 16, 16});
  MaceRun<half>(2,
                {{1, 16, 32, 16}},
                {{1, 16, 32, 16}},
                {3, 3, 16, 16});
}
// Re-run the same engine with two different input shapes (the feature this
// merge request adds), for one and for two input/output pairs.
TEST_F(MaceAPITest, GPUVariableInputShape) {
  MaceRun<float>(1,
                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
                 {3, 3, 16, 16});
  MaceRun<float>(2,
                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
                 {3, 3, 16, 16});
}
} // namespace test
} // namespace mace
...@@ -94,8 +94,8 @@ class Tuner { ...@@ -94,8 +94,8 @@ class Tuner {
Tuner &operator=(const Tuner &) = delete; Tuner &operator=(const Tuner &) = delete;
inline void WriteRunParameters() { inline void WriteRunParameters() {
VLOG(3) << "Write tuning result to " << path_;
if (path_ != nullptr) { if (path_ != nullptr) {
VLOG(3) << "Write tuning result to " << path_;
std::ofstream ofs(path_, std::ios::binary | std::ios::out); std::ofstream ofs(path_, std::ios::binary | std::ios::out);
if (ofs.is_open()) { if (ofs.is_open()) {
int64_t num_pramas = param_table_.size(); int64_t num_pramas = param_table_.size();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册