Commit eef80d7c authored by yejianwu

merge with master

@@ -23,3 +23,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "is_profiling",
+    define_values = {
+        "profiling": "true",
+    },
+    visibility = ["//visibility:public"],
+)
@@ -7,7 +7,7 @@ package(

 licenses(["notice"]) # Apache 2.0

-load("//mace:mace.bzl", "if_android")
+load("//mace:mace.bzl", "if_android", "if_profiling")

 cc_library(
     name = "opencl_runtime",
@@ -19,7 +19,7 @@ cc_library(
         "runtime/opencl/cl2.hpp",
         "runtime/opencl/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
     deps = [
         ":logging",
         "@opencl_headers//:opencl20_headers",
...
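For reference: a Bazel config_setting with define_values is matched by the --define command-line flag, so the profiling copt above can be switched on with "bazel build --define profiling=true" on the affected targets; if_profiling is presumably a select() helper over //mace:is_profiling defined in mace.bzl (not shown in this diff).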
@@ -1098,7 +1098,7 @@ namespace half_float

     /// Conversion constructor.
     /// \param rhs float to convert
-    explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+    half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}

     /// Conversion to single-precision.
     /// \return single precision value representing expression value
...
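Dropping explicit here makes the float-to-half conversion implicit, which generic kernel code instantiated with T = half (as elsewhere in this commit) relies on. A one-line illustration, not part of the diff:

  half_float::half h = 0.5f;  // compiles only once the constructor is non-explicit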
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   cl_int error;
   cl::Image2D *cl_image =
       new cl::Image2D(OpenCLRuntime::Get()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format,
                       image_shape[0], image_shape[1],
                       0, nullptr, &error);
+  MACE_CHECK(error == CL_SUCCESS);
   return cl_image;
 }
...
@@ -6,6 +6,24 @@

 namespace mace {

+OpKeyBuilder::OpKeyBuilder(const char *op_name): op_name_(op_name) {}
+
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
+                                           const DataType allowed) {
+  type_constraint_[attr_name] = allowed;
+  return *this;
+}
+
+const std::string OpKeyBuilder::Build() {
+  static const std::vector<std::string> type_order = {"T"};
+  std::string key = op_name_;
+  for (auto type : type_order) {
+    key += type + "_" + DataTypeToString(type_constraint_[type]);
+  }
+  return key;
+}
+
 std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
   static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
   return &g_device_type_registry;
@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                         Workspace *ws,
                                         DeviceType type) {
   OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
-  return registry->Create(operator_def.type(), operator_def, ws);
+  const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(operator_def,
+                                                                        "T",
+                                                                        static_cast<int>(DT_FLOAT));
+  return registry->Create(OpKeyBuilder(operator_def.type().data())
+                              .TypeConstraint("T", static_cast<DataType>(dtype))
+                              .Build(),
+                          operator_def,
+                          ws);
 }

 OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
...
@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
   }
 };

+class OpKeyBuilder {
+ public:
+  explicit OpKeyBuilder(const char *op_name);
+
+  OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
+
+  template <typename T>
+  OpKeyBuilder &TypeConstraint(const char *attr_name);
+
+  const std::string Build();
+
+ private:
+  std::string op_name_;
+  std::map<std::string, DataType> type_constraint_;
+};
+
+template <typename T>
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
+  return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
+}
+
 #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
   namespace {                                              \
     static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
...
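For orientation, a sketch of the registry key this builder produces, derived directly from Build() above (the op name is illustrative):

  // DataTypeToString(DT_FLOAT) yields "DT_FLOAT", so:
  std::string key = OpKeyBuilder("Conv2D").TypeConstraint<float>("T").Build();
  // key == "Conv2DT_DT_FLOAT" -- the string CreateOperator now passes to registry->Create()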
@@ -106,10 +106,10 @@ class Registerer {
   }

 #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)

 #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)

 } // namespace mace
...
@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
       return;
     }

+    cl_command_queue_properties properties = 0;
+#ifdef __ENABLE_PROFILING
+    enable_profiling_ = true;
+    profiling_ev_.reset(new cl::Event());
+    properties = CL_QUEUE_PROFILING_ENABLE;
+#endif
+
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl_command_queue_properties properties = 0;
-    if (enable_profiling_) {
-      profiling_ev_.reset(new cl::Event());
-      properties = CL_QUEUE_PROFILING_ENABLE;
-    }
     cl::CommandQueue command_queue(context, gpu_device, properties);

     instance = new OpenCLRuntime(context, gpu_device, command_queue);
@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
 }

 cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 }

 cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }

@@ -139,6 +141,7 @@ const std::map<std::string, std::string>
     OpenCLRuntime::program_map_ = {
   {"addn", "addn.cl"},
   {"batch_norm", "batch_norm.cl"},
+  {"conv_2d", "conv_2d.cl"},
   {"conv_2d_1x1", "conv_2d_1x1.cl"},
   {"conv_2d_3x3", "conv_2d_3x3.cl"},
   {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
...
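A minimal sketch of timing a kernel with these accessors, assuming a profiling build (--define profiling=true) and that the kernel was enqueued with GetDefaultEvent() as its completion event; OpenCL profiling counters are in nanoseconds:

  auto runtime = OpenCLRuntime::Get();
  runtime->command_queue().finish();  // wait until the profiled event has completed
  cl_ulong start_ns = runtime->GetEventProfilingStartInfo();
  cl_ulong end_ns = runtime->GetEventProfilingEndInfo();
  double elapsed_ms = (end_ns - start_ns) / 1e6;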
@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }

+std::string DataTypeToString(const DataType dt) {
+  static std::map<DataType, std::string> dtype_string_map = {
+      {DT_FLOAT, "DT_FLOAT"},
+      {DT_HALF, "DT_HALF"},
+      {DT_DOUBLE, "DT_DOUBLE"},
+      {DT_UINT8, "DT_UINT8"},
+      {DT_INT8, "DT_INT8"},
+      {DT_INT32, "DT_INT32"},
+      {DT_UINT32, "DT_UINT32"},
+      {DT_UINT16, "DT_UINT16"},
+      {DT_INT64, "DT_INT64"},
+      {DT_BOOL, "DT_BOOL"},
+      {DT_STRING, "DT_STRING"}
+  };
+  MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
+  return dtype_string_map[dt];
+}
+
 size_t GetEnumTypeSize(const DataType dt) {
   switch (dt) {
...
@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);

 size_t GetEnumTypeSize(const DataType dt);

+std::string DataTypeToString(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
...
@@ -24,7 +24,7 @@ cc_library(
         "*.h",
         "hexagon/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/proto:cc_proto",
         "//mace/core:core",
@@ -36,7 +36,7 @@ cc_test(
     name = "dsp_test",
     testonly = 1,
     srcs = glob(["*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -52,7 +52,7 @@ cc_test(
     name = "dsp_op_test",
     testonly = 1,
     srcs = glob(["test/*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -64,3 +64,21 @@ cc_test(
         "//mace/kernels:kernels",
     ],
 )
+
+cc_binary(
+    name = "mace_dsp_run",
+    srcs = [
+        "tool/mace_dsp_run.cc",
+    ],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
+    linkopts = if_android([
+        "-ldl",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":dsp",
+        "//mace/kernels:kernels",
+        "//mace/utils:command_line_flags",
+    ],
+)
\ No newline at end of file
@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
   }

   // input info
-  const InputInfo& input_info = net_def.input_info()[0];
-  input_shape_.insert(input_shape_.begin(),
-                      input_info.dims().begin(), input_info.dims().end());
-  while (input_shape_.size() < 4) {
-    input_shape_.insert(input_shape_.begin(), 1);
+  num_inputs_ = 0;
+  for (const InputInfo &input_info: net_def.input_info()) {
+    vector<index_t> input_shape;
+    input_shape.insert(input_shape.begin(),
+                       input_info.dims().begin(), input_info.dims().end());
+    while (input_shape.size() < 4) {
+      input_shape.insert(input_shape.begin(), 1);
+    }
+    input_shapes_.push_back(input_shape);
+    input_data_types_.push_back(input_info.data_type());
+    num_inputs_ += 1;
   }
-  input_data_type_ = input_info.data_type();

   // output info
-  const OutputInfo& output_info = net_def.output_info()[0];
-  output_shape_.insert(output_shape_.begin(),
-                       output_info.dims().begin(), output_info.dims().end());
-  while (output_shape_.size() < 4) {
-    output_shape_.insert(output_shape_.begin(), 1);
+  num_outputs_ = 0;
+  for (const OutputInfo &output_info: net_def.output_info()) {
+    vector<index_t> output_shape;
+    output_shape.insert(output_shape.begin(),
+                        output_info.dims().begin(), output_info.dims().end());
+    while (output_shape.size() < 4) {
+      output_shape.insert(output_shape.begin(), 1);
+    }
+    output_shapes_.push_back(output_shape);
+    output_data_types_.push_back(output_info.data_type());
+    num_outputs_ += 1;
   }
-  output_data_type_ = output_info.data_type();

   bool res = hexagon_nn_prepare(nn_id_) == 0;
   return res;
@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
   hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
 }

+bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                         Tensor *output_tensor) {
+  LOG(INFO) << "Execute graph: " << nn_id_;
+  // single input and single output
+  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
+  output_tensor->SetDtype(output_data_types_[0]);
+  output_tensor->Resize(output_shapes_[0]);
+  vector<uint32_t> output_shape(4);
+  uint32_t output_bytes;
+  int res = hexagon_nn_execute(nn_id_,
+                               input_tensor.shape()[0],
+                               input_tensor.shape()[1],
+                               input_tensor.shape()[2],
+                               input_tensor.shape()[3],
+                               reinterpret_cast<const unsigned char *>(
+                                   input_tensor.raw_data()),
+                               input_tensor.raw_size(),
+                               &output_shape[0],
+                               &output_shape[1],
+                               &output_shape[2],
+                               &output_shape[3],
+                               reinterpret_cast<unsigned char *>(
+                                   output_tensor->raw_mutable_data()),
+                               output_tensor->raw_size(),
+                               &output_bytes);
+
+  MACE_ASSERT(output_shape == output_shapes_[0],
+              "wrong output shape inferred");
+  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
+              "wrong output bytes inferred.");
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
+                                            vector<Tensor> *output_tensors) {
+  LOG(INFO) << "Execute graph new: " << nn_id_;
+  int num_inputs = input_tensors.size();
+  int num_outputs = output_tensors->size();
+  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
+
+  hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
+  hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
+
+  for (int i = 0; i < num_inputs; ++i) {
+    vector<index_t> input_shape = input_tensors[i].shape();
+    inputs[i].batches = input_shape[0];
+    inputs[i].height = input_shape[1];
+    inputs[i].width = input_shape[2];
+    inputs[i].depth = input_shape[3];
+    inputs[i].data = const_cast<unsigned char *>(
+        reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
+    inputs[i].dataLen = input_tensors[i].raw_size();
+    inputs[i].data_valid_len = input_tensors[i].raw_size();
+    inputs[i].unused = 0;
+  }
+
+  for (int i = 0; i < num_outputs; ++i) {
+    (*output_tensors)[i].SetDtype(output_data_types_[i]);
+    (*output_tensors)[i].Resize(output_shapes_[i]);
+    outputs[i].data = reinterpret_cast<unsigned char *>(
+        (*output_tensors)[i].raw_mutable_data());
+    outputs[i].dataLen = (*output_tensors)[i].raw_size();
+  }
+
+  int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
+                                   outputs, num_outputs);
+
+  for (int i = 0; i < num_outputs; ++i) {
+    vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
+                                   outputs[i].width, outputs[i].depth};
+    MACE_ASSERT(output_shape == output_shapes_[i],
+                "wrong output shape inferred");
+    MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
+                "wrong output bytes inferred.");
+  }
+
+  delete [] inputs;
+  delete [] outputs;
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
+                                                    Tensor *output_tensor) {
+  vector<Tensor> input_tensors(3);
+  vector<Tensor> output_tensors(3);
+  input_tensors[0].SetDtype(DT_UINT8);
+  output_tensors[0].SetDtype(DT_UINT8);
+  input_tensors[0].ResizeLike(input_tensor);
+  input_tensors[1].Resize({1, 1, 1, 1});
+  float *min_in_data = input_tensors[1].mutable_data<float>();
+  input_tensors[2].Resize({1, 1, 1, 1});
+  float *max_in_data = input_tensors[2].mutable_data<float>();
+  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
+
+  if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
+    return false;
+  }
+
+  output_tensor->ResizeLike(output_tensors[0]);
+
+  const float *min_out_data = output_tensors[1].data<float>();
+  const float *max_out_data = output_tensors[2].data<float>();
+  quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
+  return true;
+}
+
 } // namespace mace
\ No newline at end of file
@@ -7,6 +7,7 @@

 #include "mace/dsp/hexagon/hexagon_controller.h"
 #include "mace/dsp/hexagon_nn_ops.h"
+#include "mace/dsp/util/quantize.h"
 #include "mace/core/common.h"
 #include "mace/core/tensor.h"
 #include "mace/proto/mace.pb.h"
@@ -23,35 +24,10 @@ class HexagonControlWrapper {
   bool Finalize();
   bool SetupGraph(const NetDef& net_def);
   bool SetupGraph(const std::string &model_file);
-  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) {
-    LOG(INFO) << "Execute graph: " << nn_id_;
-    output_tensor->SetDtype(output_data_type_);
-    output_tensor->Resize(output_shape_);
-    vector<uint32_t> output_shape(4);
-    uint32_t output_bytes;
-    int res = hexagon_nn_execute(nn_id_,
-                                 input_tensor.shape()[0],
-                                 input_tensor.shape()[1],
-                                 input_tensor.shape()[2],
-                                 input_tensor.shape()[3],
-                                 reinterpret_cast<const unsigned char *>(
-                                     input_tensor.raw_data()),
-                                 input_tensor.raw_size(),
-                                 &output_shape[0],
-                                 &output_shape[1],
-                                 &output_shape[2],
-                                 &output_shape[3],
-                                 reinterpret_cast<unsigned char *>(
-                                     output_tensor->raw_mutable_data()),
-                                 output_tensor->raw_size(),
-                                 &output_bytes);
-    MACE_ASSERT(output_shape == output_shape_,
-                "wrong output shape inferred");
-    MACE_ASSERT(output_bytes == output_tensor->raw_size(),
-                "wrong output bytes inferred.");
-    return res == 0;
-  };
+  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
+  bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
+                       vector<Tensor> *output_tensors);
+  bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);

   bool TeardownGraph();
   void PrintLog();
@@ -70,11 +46,14 @@ class HexagonControlWrapper {
   int nn_id_;
   Serializer serializer_;
+  Quantizer quantizer_;

-  vector<index_t> input_shape_;
-  vector<index_t> output_shape_;
-  DataType input_data_type_;
-  DataType output_data_type_;
+  vector<vector<index_t>> input_shapes_;
+  vector<vector<index_t>> output_shapes_;
+  vector<DataType> input_data_types_;
+  vector<DataType> output_data_types_;
+  uint32_t num_inputs_;
+  uint32_t num_outputs_;

   DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
...
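A rough usage sketch of the new multi-tensor entry point (model file name and shapes are placeholders, not taken from this commit):

  HexagonControlWrapper wrapper;
  wrapper.Init();
  wrapper.Config();
  wrapper.SetupGraph("model.pb");  // hypothetical model file
  vector<Tensor> inputs(1), outputs(1);
  inputs[0].SetDtype(DT_FLOAT);
  inputs[0].Resize({1, 480, 480, 3});
  // ... fill inputs[0] via mutable_data<float>() ...
  bool ok = wrapper.ExecuteGraphNew(inputs, &outputs);
  wrapper.TeardownGraph();
  wrapper.Finalize();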
@@ -8,7 +8,7 @@

 using namespace mace;

-TEST(HexagonControlerWrapper, GetVersion) {
+TEST(HexagonControlerWrapper, InputFloat) {
   testing::internal::LogToStderr();
   HexagonControlWrapper wrapper;
   VLOG(0) << "version: " << wrapper.GetVersion();
@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
   wrapper.ResetPerfInfo();
   timeval tv1, tv2;
   gettimeofday(&tv1, NULL);
-  int round = 2;
+  int round = 10;
   for (int i = 0; i < round; ++i) {
     VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   }
@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
   }
   std::cout << std::endl;

+  VLOG(0) << wrapper.TeardownGraph();
+  wrapper.Finalize();
+}
+
+TEST(HexagonControlerWrapper, PreQuantize) {
+  testing::internal::LogToStderr();
+  HexagonControlWrapper wrapper;
+  VLOG(0) << "version: " << wrapper.GetVersion();
+  wrapper.Init();
+  wrapper.SetDebugLevel(0);
+  wrapper.Config();
+  VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
+  wrapper.PrintGraph();
+
+  Tensor input_tensor;
+  Tensor output_tensor;
+  input_tensor.Resize({1, 480, 480, 3});
+  float *input_data = input_tensor.mutable_data<float>();
+  for (int i = 0; i < input_tensor.size(); ++i) {
+    input_data[i] = i % 256;
+  }
+
+  wrapper.ResetPerfInfo();
+  timeval tv1, tv2;
+  gettimeofday(&tv1, NULL);
+  int round = 10;
+  for (int i = 0; i < round; ++i) {
+    VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
+  }
+  gettimeofday(&tv2, NULL);
+  VLOG(0) << "avg duration: "
+          << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
+              (tv2.tv_usec - tv1.tv_usec) / 1000) /
+             round;
+
+  wrapper.GetPerfInfo();
+  wrapper.PrintLog();
+
+  const float *output_data = output_tensor.data<float>();
+  for (int i = 0; i < output_tensor.size(); ++i) {
+    std::cout << output_data[i] << " ";
+  }
+  std::cout << std::endl;
+
   VLOG(0) << wrapper.TeardownGraph();
   wrapper.Finalize();
 }
\ No newline at end of file
@@ -5,6 +5,7 @@
 #include "mace/dsp/hexagon_control_wrapper.h"
 #include "gtest/gtest.h"

+#define RESIZE_BILINEAR_TEST_CHANNELS 128
 using namespace mace;

 static NetDef BuildNetDef() {
@@ -17,7 +18,7 @@ static NetDef BuildNetDef() {
   input_op->set_type("INPUT");
   input_op->set_node_id(0);
   input_op->set_padding(0);
-  input_op->add_out_max_byte_size(1000);
+  input_op->add_out_max_byte_size(1200);

   // relu op
   OperatorDef *resize_bilinear_op = net.add_op();
@@ -45,7 +46,7 @@ static NetDef BuildNetDef() {
   input_node_input = resize_bilinear_op->add_node_input();
   input_node_input->set_node_id(12);
   input_node_input->set_output_port(0);
-  resize_bilinear_op->add_out_max_byte_size(1000);
+  resize_bilinear_op->add_out_max_byte_size(1200);
   resize_bilinear_op->add_out_max_byte_size(1000);
   resize_bilinear_op->add_out_max_byte_size(1000);
@@ -64,8 +65,8 @@ static NetDef BuildNetDef() {
   new_dim_tensor->add_dims(2);
   new_dim_tensor->set_data_type(DataType::DT_INT32);
   new_dim_tensor->set_node_id(10);
-  new_dim_tensor->add_int32_data(1);
-  new_dim_tensor->add_int32_data(1);
+  new_dim_tensor->add_int32_data(2);
+  new_dim_tensor->add_int32_data(2);

   TensorProto *input_min_tensor = net.add_tensors();
   input_min_tensor->set_name("input_min");
@@ -86,20 +87,20 @@ static NetDef BuildNetDef() {
   input_info->set_name("input_node");
   input_info->set_node_id(0);
   input_info->add_dims(1);
-  input_info->add_dims(2);
-  input_info->add_dims(2);
-  input_info->add_dims(128);
+  input_info->add_dims(3);
+  input_info->add_dims(3);
+  input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   input_info->set_data_type(DataType::DT_UINT8);
-  input_info->set_max_byte_size(1000);
+  input_info->set_max_byte_size(1200);

   OutputInfo *output_info = net.add_output_info();
   output_info->set_name("output_node");
   output_info->set_node_id(1);
   output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(128);
+  output_info->add_dims(2);
+  output_info->add_dims(2);
+  output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   output_info->set_data_type(DataType::DT_UINT8);
-  output_info->set_max_byte_size(1000);
+  output_info->set_max_byte_size(1200);

   return net;
 }
@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) {
   Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
   Tensor input_tensor(cpu_allocator, DT_UINT8);
   Tensor output_tensor(cpu_allocator, DT_UINT8);
-  input_tensor.Resize({1, 2, 2, 128});
-  output_tensor.Resize({1, 1, 1, 128});
+  input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS});
+  output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS});
   uint8_t *input_data = input_tensor.mutable_data<uint8_t>();
   const uint8_t *output_data = output_tensor.data<uint8_t>();
-  for (int c = 0; c < 128; ++c) {
-    input_data[c] = input_data[c + 128] = input_data[c + 256]
-        = input_data[c + 384] = (uint8_t)c;
+  for (int wh = 0; wh < 9; ++wh) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) {
+      input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh;
+    }
   }

   VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();

-  for (int i = 0; i < output_tensor.size(); ++i) {
-    EXPECT_EQ(i, output_data[i]);
+  vector<uint8_t> expected {9, 8, 5, 3};
+  for (int i = 0; i < 4; ++i) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c)
+      EXPECT_EQ(expected[i],
+                output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]);
   }
   std::cout << std::endl;
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

/**
 * Usage:
 * mace_dsp_run --model=mobi_mace.pb \
 *              --input_shape=1,3,224,224 \
 *              --input_file=input_data \
 *              --output_file=mace.out
 */
#include <sys/time.h>
#include <fstream>
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/net.h"
#include "mace/utils/command_line_flags.h"

using namespace std;
using namespace mace;

void ParseShape(const string &str, vector<index_t> *shape) {
  string tmp = str;
  while (!tmp.empty()) {
    int dim = atoi(tmp.data());
    shape->push_back(dim);
    size_t next_offset = tmp.find(",");
    if (next_offset == string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
}

int main(int argc, char **argv) {
  string model_file;
  string input_shape;
  string input_file;
  string output_file;
  int round = 1;

  std::vector<Flag> flag_list = {
      Flag("model", &model_file, "model file name"),
      Flag("input_shape", &input_shape, "input shape, separated by comma"),
      Flag("input_file", &input_file, "input file name"),
      Flag("output_file", &output_file, "output file name"),
      Flag("round", &round, "round"),
  };

  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

  VLOG(0) << "model: " << model_file << std::endl
          << "input_shape: " << input_shape << std::endl
          << "input_file: " << input_file << std::endl
          << "output_file: " << output_file << std::endl
          << "round: " << round << std::endl;

  vector<index_t> shape;
  ParseShape(input_shape, &shape);

  // load input
  Tensor input_tensor;
  input_tensor.Resize(shape);
  float *input_data = input_tensor.mutable_data<float>();
  ifstream in_file(input_file, ios::in | ios::binary);
  in_file.read(reinterpret_cast<char *>(input_data),
               input_tensor.size() * sizeof(float));
  in_file.close();

  // execute
  HexagonControlWrapper wrapper;
  VLOG(0) << "version: " << wrapper.GetVersion();
  wrapper.Init();
  wrapper.SetDebugLevel(0);
  wrapper.Config();
  VLOG(0) << wrapper.SetupGraph(model_file);
  wrapper.PrintGraph();

  Tensor output_tensor;
  timeval tv1, tv2;
  gettimeofday(&tv1, NULL);
  for (int i = 0; i < round; ++i) {
    VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
  }
  gettimeofday(&tv2, NULL);
  cout << "avg duration: "
       << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
           (tv2.tv_usec - tv1.tv_usec) / 1000) /
          round
       << endl;

  wrapper.GetPerfInfo();
  wrapper.PrintLog();
  VLOG(0) << wrapper.TeardownGraph();
  wrapper.Finalize();

  // save output
  ofstream out_file(output_file, ios::binary);
  out_file.write((const char *) (output_tensor.data<float>()),
                 output_tensor.size() * sizeof(float));
  out_file.flush();
  out_file.close();
}
\ No newline at end of file
@@ -20,7 +20,7 @@ cc_library(
     hdrs = glob([
         "*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/core:core",
     ],
...
@@ -10,15 +10,23 @@
 namespace mace {
 namespace kernels {

-template<DeviceType D, typename T>
-struct AddNFunctor {
-  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+struct AddNFunctorBase {};
+
+template <DeviceType D, typename T>
+struct AddNFunctor : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor) {
+    output_tensor->ResizeLike(input_tensors[0]);
     Tensor::MappingGuard output_map(output_tensor);
     index_t size = input_tensors[0]->size();
     T *output_ptr = output_tensor->mutable_data<T>();
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
+      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
+      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
+      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
+      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
   }
 };

-template<>
+template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);

-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(
-    std::vector<const Tensor *> &inputs, Tensor *output);
+template <typename T>
+struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor);
+};

 } // namespace kernels
 } // namespace mace

 #endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
@@ -28,9 +28,10 @@ struct BatchNormFunctor {
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
-    const index_t n = input->dim(0);
-    const index_t channel = input->dim(1);
-    const index_t sample_size = input->dim(2) * input->dim(3);
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);

     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard scale_mapper(scale);
@@ -48,19 +49,26 @@ struct BatchNormFunctor {
     const T *epsilon_ptr = epsilon->data<T>();
     T *output_ptr = output->mutable_data<T>();

+    vector<T> new_scale(channels);
+    vector<T> new_offset(channels);
+
 #pragma omp parallel for
-    for (index_t c = 0; c < channel; ++c) {
-      T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
-      T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
-      index_t pos = c * sample_size;
+    for (index_t c = 0; c < channels; ++c) {
+      new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+      new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
+    }

-      for (index_t i = 0; i < n; ++i) {
-        const T *input_sample_ptr = input_ptr + pos;
-        T *output_sample_ptr = output_ptr + pos;
-        for (index_t j = 0; j < sample_size; ++j) {
-          output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
+    index_t pos = 0;
+
+#pragma omp parallel for
+    for (index_t n = 0; n < batch; ++n) {
+      for (index_t h = 0; h < height; ++h) {
+        for (index_t w = 0; w < width; ++w) {
+          for (index_t c = 0; c < channels; ++c) {
+            output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
+            ++pos;
+          }
         }
-        pos += channel * sample_size;
       }
     }
   }
@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);

-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const Tensor *var,
-    const Tensor *epsilon,
-    Tensor *output);
+template <typename T>
+struct BatchNormFunctor<DeviceType::OPENCL, T> {
+  void operator()(const Tensor *input,
+                  const Tensor *scale,
+                  const Tensor *offset,
+                  const Tensor *mean,
+                  const Tensor *var,
+                  const Tensor *epsilon,
+                  Tensor *output);
+};

 } // namepsace kernels
 } // namespace mace
...
@@ -11,13 +11,23 @@
 namespace mace {
 namespace kernels {

+struct Conv2dFunctorBase {
+  Conv2dFunctorBase(const int *strides,
+                    const Padding &paddings,
+                    const int *dilations)
+      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const int *dilations_;  // [dilation_h, dilation_w]
+  Padding paddings_;
+};
+
 template<DeviceType D, typename T>
-struct Conv2dFunctor {
-  Conv2dFunctor() {}
+struct Conv2dFunctor : Conv2dFunctorBase {
   Conv2dFunctor(const int *strides,
                 const Padding &paddings,
                 const int *dilations)
-      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+      : Conv2dFunctorBase(strides, paddings, dilations) {}

   void operator()(const Tensor *input,
                   const Tensor *filter,
@@ -76,9 +86,10 @@ struct Conv2dFunctor {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
-              T bias_channel = bias_data ? bias_data[c] : 0;
+              T bias_channel = 0.0f;
+              if (bias) bias_channel = bias_data[c];
               *output_data = bias_channel;
-              T sum = 0;
+              T sum = 0.0f;
               const T *filter_ptr = filter_data + c;
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
@@ -113,9 +124,6 @@ struct Conv2dFunctor {
   }

-  const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
-  Padding paddings_;
 };

 template<>
@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                         const Tensor *filter,
                                                         const Tensor *bias,
                                                         Tensor *output);
-template<>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                          const Tensor *filter,
-                                                          const Tensor *bias,
-                                                          Tensor *output);
+
+template<typename T>
+struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
+  Conv2dFunctor(const int *strides,
+                const Padding &paddings,
+                const int *dilations)
+      : Conv2dFunctorBase(strides, paddings, dilations) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output);
+};

 } // namespace kernels
 } // namespace mace
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#ifndef MACE_KERNELS_FUSED_CONV_2D_H_
#define MACE_KERNELS_FUSED_CONV_2D_H_

#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/conv_2d.h"

namespace mace {
namespace kernels {

struct FusedConv2dFunctorBase {
  FusedConv2dFunctorBase(const int *strides,
                         const Padding &paddings,
                         const int *dilations)
      : strides_(strides), dilations_(dilations), paddings_(paddings) {}

  const int *strides_;    // [stride_h, stride_w]
  const int *dilations_;  // [dilation_h, dilation_w]
  Padding paddings_;
};

template<DeviceType D, typename T>
struct FusedConv2dFunctor : FusedConv2dFunctorBase {
  FusedConv2dFunctor(const int *strides,
                     const Padding &paddings,
                     const int *dilations)
      : FusedConv2dFunctorBase(strides, paddings, dilations) {}

  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output) {
    Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
    T *output_data = output->mutable_data<T>();
    T zero_value;
    if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
      zero_value = half_float::half_cast<half>(0.0f);
    } else {
      zero_value = 0;
    }

    auto output_size = output->size();
    for (int n = 0; n < output_size; ++n) {
      *output_data = *output_data < 0 ? zero_value : *output_data;
      output_data++;
    }
  }
};

template<typename T>
struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
  FusedConv2dFunctor(const int *strides,
                     const Padding &paddings,
                     const int *dilations)
      : FusedConv2dFunctorBase(strides, paddings, dilations) {}

  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output);
};

} // namespace kernels
} // namespace mace

#endif // MACE_KERNELS_FUSED_CONV_2D_H_
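A hedged call-site sketch for the generic path above (stride/padding/dilation values are illustrative, and Padding::SAME is assumed to be a member of the Padding enum):

  int strides[2] = {1, 1};
  int dilations[2] = {1, 1};
  kernels::FusedConv2dFunctor<DeviceType::CPU, float> fused_conv(
      strides, Padding::SAME, dilations);
  fused_conv(input, filter, bias, output);  // conv2d, then negatives clamped to zero (ReLU)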
@@ -10,7 +10,7 @@ namespace kernels {

 template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   // TODO: neon mem copy
   index_t size = output_tensor->size();
   float *output_ptr = output_tensor->mutable_data<float>();
@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
 };

 } // namespace kernels
 } // namespace mace
\ No newline at end of file
@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *input_tensor,
     Tensor *output_tensor) {

+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  std::vector<index_t> filter_shape(4);
+  filter_shape[0] = input_tensor->shape()[1];
+  filter_shape[1] = input_tensor->shape()[1];
+  filter_shape[2] = kernels_[0];
+  filter_shape[3] = kernels_[1];
+  kernels::CalcPaddingAndOutputSize(
+      input_tensor->shape().data(), filter_shape.data(), this->dilations_,
+      strides_, this->padding_, output_shape.data(),
+      paddings.data());
+  output_tensor->Resize(output_shape);
+
   const float *input = input_tensor->data<float>();
   float *output = output_tensor->mutable_data<float>();
   const index_t *input_shape = input_tensor->shape().data();
-  const index_t *output_shape = output_tensor->shape().data();
-
-  int paddings[2];
-  std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                       kernels_[0], kernels_[1]};
-  kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                          strides_, this->padding_, paddings);
 #ifdef __COPY_MAKE_PADDING
   Tensor padded_input;
-  ConstructInputWithPadding(input_tensor, paddings, &padded_input);
+  ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
   input = padded_input.data<float>();
   input_shape = padded_input.shape().data();
 #endif
@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     // kernel_size: 2x2, strides: 2x2
     if (pooling_type_ == MAX) {  // MAX_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     } else {  // AVG_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     }
   } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     // kernel_size: 3x3, strides: 2x2
     if (pooling_type_ == MAX) {  // MAX_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     } else {  // AVG_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     }
   } else {  // not implement yet
...
@@ -5,52 +5,83 @@
 #include "mace/kernels/addn.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

-static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
-  index_t element_size = input0->NumElements();
-  index_t blocks = (element_size + 3) / 4;
-
-  const uint32_t gws = blocks;
+template <typename T>
+static void AddN(const std::vector<const Tensor *> &input_tensors,
+                 Tensor *output) {
+  if (input_tensors.size() > 4) {
+    MACE_NOT_IMPLEMENTED;
+  }
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channels = output->dim(3);
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_pixels = channel_blocks * width;
+  const index_t batch_height_pixels = batch * height;

   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
-  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
+  auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);

   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);

   uint32_t idx = 0;
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
-  addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
-  addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  for (auto input : input_tensors) {
+    addn_kernel.setArg(idx++,
+                       *(static_cast<const cl::Image2D *>(input->buffer())));
+  }
+  addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       addn_kernel, cl::NullRange,
-      cl::NDRange(gws),
-      cl::NDRange(lws),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+      cl::NDRange(width_pixels, batch_height_pixels),
+      cl::NDRange(64, 16),  // TODO fix this
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
+  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
 }

-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
-                                                        Tensor *output_tensor) {
-  if (input_tensors.empty() || input_tensors.front() == nullptr) {
-    return;
-  }
+template <typename T>
+void AddNFunctor<DeviceType::OPENCL, T>::operator()(
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   size_t size = input_tensors.size();
+  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
+
+  const index_t batch = input_tensors[0]->dim(0);
+  const index_t height = input_tensors[0]->dim(1);
+  const index_t width = input_tensors[0]->dim(2);
+  const index_t channels = input_tensors[0]->dim(3);

-  switch (size) {
-    case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
-      break;
-    default:MACE_NOT_IMPLEMENTED;
+  for (int i = 1; i < size; ++i) {
+    MACE_CHECK_NOTNULL(input_tensors[i]);
+    MACE_CHECK(batch == input_tensors[i]->dim(0));
+    MACE_CHECK(height == input_tensors[i]->dim(1));
+    MACE_CHECK(width == input_tensors[i]->dim(2));
+    MACE_CHECK(channels == input_tensors[i]->dim(3));
   }
+
+  std::vector<index_t> output_shape = input_tensors[0]->shape();
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+  output_tensor->ResizeImage(output_shape, output_image_shape);
+
+  AddN<T>(input_tensors, output_tensor);
 };

+template
+struct AddNFunctor<DeviceType::OPENCL, float>;
+
+template
+struct AddNFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
@@ -11,8 +11,8 @@
 namespace mace {
 namespace kernels {

-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
     const Tensor *epsilon,
     Tensor *output) {

-  index_t pixel_size = input->dim(2) * input->dim(3);
-  index_t blocks = (pixel_size + 3) / 4;
+  const index_t batch = input->dim(0);
+  const index_t height = input->dim(1);
+  const index_t width = input->dim(2);
+  const index_t channels = input->dim(3);

-  const uint32_t gws[3] = {static_cast<uint32_t>(input->dim(0)),
-                           static_cast<uint32_t>(input->dim(1)),
-                           static_cast<uint32_t>(blocks)};
+  const index_t channel_blocks = RoundUpDiv4(channels);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};

   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);

   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  const std::vector<uint32_t> lws = {1, 1, kwg_size};
+  const std::vector<uint32_t> lws = {1, kwg_size, 1};

   uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(scale->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(offset->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(mean->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(var->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(scale->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(offset->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(mean->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
   bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer())));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(pixel_size));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
-  bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

   auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
-    return {{1, 1, 64},
+    return {{8, 128, 1}, //SNPE size
+            {1, 1, 64},
             {1, 1, 128},
             {1, kwg_size/16, 16},
             {1, kwg_size/32, 32},
@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
                  func);
 }

+template
+struct BatchNormFunctor<DeviceType::OPENCL, float>;
+
+template
+struct BatchNormFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   }

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype()));
+  if (buffer->dtype() == image->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+  }
   auto runtime = OpenCLRuntime::Get();
   string kernel_name;
   switch (type) {
...
 #include <common.h>

-// Supported data type: half/float
-__kernel void add2(__global const DATA_TYPE *input0,
-                   __global const DATA_TYPE *input1,
-                   __private const int size,
-                   __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+                   __read_only image2d_t input1,
+#if INPUT_NUM > 2
+                   __read_only image2d_t input2,
+#endif
+#if INPUT_NUM > 3
+                   __read_only image2d_t input3,
+#endif
+                   __write_only image2d_t output) {
+  const int w = get_global_id(0);
+  const int hb = get_global_id(1);

-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = *(input0+idx) + *(input1+idx);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
-    vstore4(in_data0+in_data1, idx, output);
-  }
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
+  DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
+  DATA_TYPE4 out = in0 + in1;
+
+#if INPUT_NUM > 2
+  DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
+  out = out + in2;
+#endif
+
+#if INPUT_NUM > 3
+  DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
+  out = out + in3;
+#endif
+
+  WRITE_IMAGET(output, (int2)(w, hb), out);
 }
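The /* [c%4 * w * c/4, h * b] */ comment describes the image layout these kernels share: four consecutive channels are packed into one image pixel. Reading the comment, an NHWC element (n, h, w, c) maps to image column (c / 4) * width + w and image row n * height + h, with c % 4 selecting the component; this is an interpretation of the layout comment, not code from the commit.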
 #include <common.h>

 // Supported data types: half/float
-void kernel batch_norm(global const DATA_TYPE *input,
-                       global const DATA_TYPE *scale,
-                       global const DATA_TYPE *offset,
-                       global const DATA_TYPE *mean,
-                       global const DATA_TYPE *var,
-                       global const DATA_TYPE *epsilon,
-                       private const int pixels,
-                       global DATA_TYPE *output,
-                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
-                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
-  const int batch = get_global_id(0);
-  const int channel = get_global_id(1);
-  const int channels = get_global_size(1);
-  const int pixel_offset = get_global_id(2);
-  const int local_channel = get_local_id(1);
-  const int local_pixel_idx = get_local_id(2);
+__kernel void batch_norm(__read_only image2d_t input,
+                         __read_only image2d_t scale,
+                         __read_only image2d_t offset,
+                         __read_only image2d_t mean,
+                         __read_only image2d_t var,
+                         __global const DATA_TYPE *epsilon,
+                         __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);

-  if(local_pixel_idx == 0) {
-    new_scale[local_channel] = (float4)(scale[channel] * rsqrt(var[channel] + *epsilon));
-    new_offset[local_channel] = (float4)(offset[channel] - mean[channel] * new_scale[local_channel].x);
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
+  DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));

-  const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4;
-  const DATA_TYPE *input_ptr = input + image_offset;
-  DATA_TYPE *output_ptr = output + image_offset;
-  const int end = (batch * channels + channel + 1) * pixels;
-  if ((image_offset+4) > end) {
-    for (int i = image_offset; i < end; ++i) {
-      *output_ptr = new_scale[local_channel].x * *input_ptr + new_offset[local_channel].x;
-      ++input_ptr;
-      ++output_ptr;
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr);
-    values = values * new_scale[local_channel] + new_offset[local_channel];
-    vstore4(values, 0, output_ptr);
-  }
-}
+  DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon));
+  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
+
+  const int pos = ch_blk * width + w;
+
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = in * new_scale + new_offset;
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
+}
...@@ -14,4 +14,11 @@
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
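For reference, a sketch of how these token-pasting macros expand under a half-precision build (assuming -DDATA_TYPE=half -DCMD_DATA_TYPE=h; a float build substitutes float/f):

// DATA_TYPE4   -> VEC_DATA_TYPE(half, 4)   -> half4
// READ_IMAGET  -> CMD_TYPE(read_image, h)  -> read_imageh
// WRITE_IMAGET -> CMD_TYPE(write_image, h) -> write_imageh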
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int out_height,
__private const int out_width,
__private const int filter_height,
__private const int filter_width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
  // Unrolling this loop hurts performance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
        // Would prefetching into L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
}
}
}
#ifdef FUSED_RELU
  // TODO: support relux
out0 = fmax(out0, 0);
out1 = fmax(out1, 0);
out2 = fmax(out2, 0);
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
}
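A worked example of the filter indexing above (annotation only): with in_ch_blks = 2, so rounded_in_ch = 8, and a 3x3 filter, the tap at hb_idx = 1, width_idx = 2 of input channel block 1 gives:

// filter_idx = (1 << 2) + (1 * 3 + 2) * 8 = 4 + 40 = 44
// weights0..weights3 are then read from columns 44..47 of row out_ch_blk,
// matching the filter layout [cout%4 * cin * kw * kh, cout/4].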
#include <common.h>
#define vec_conv_2d_1x1_s1 \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
#define vec_conv_2d_1x1_s2 \
VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
#define vec_conv_2d_1x1_compute_loop \
for (int oc = 0; oc < 4; ++oc) { \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr + oc * out_pixel); \
}
#define vec_conv_2d_1x1_compute \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr);
// Supported data type: half/float
__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
__global const DATA_TYPE *filter, /* o, i, kh, kw */
#ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif /* defined(BIAS) */
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef BIAS
DATA_TYPE bias_value = bias[out_chan];
#else
DATA_TYPE bias_value = 0;
#endif
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value;
}
}
int in_chan = 0;
if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute_loop;
}
for (; out_chan < out_chan_end; ++out_chan) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute;
}
}
}
for (; in_chan < in_chan_num; ++in_chan) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p*stride];
output_ptr[p] += in * weights;
}
}
}
}
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int height,
                          __private const int width) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
...@@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  int4 w;
#if STRIDE == 1
  w.x = out_w_blk;
  w.y = w.x + out_w_blks;
  w.z = w.y + out_w_blks;
  w.w = w.z + out_w_blks;
  int out_hb_idx = (out_hb % height);
#else
  w.x = out_w_blk * 2;
  w.y = (out_w_blk + out_w_blks) * 2;
  w.z = (out_w_blk + 2 * out_w_blks) * 2;
  w.w = (out_w_blk + 3 * out_w_blks) * 2;
  int out_hb_idx = (out_hb % height) * 2;
#endif

  w.x = select(w.x, INT_MIN, w.x >= in_width);
  w.y = select(w.y, INT_MIN, w.y >= in_width);
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);

  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
                      -1,
                      out_hb_idx >= in_height);

  // Unrolling this loop hurts performance
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

    const int filter_x0 = in_ch_blk << 2;
    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));

    // Would prefetching into L2 improve performance? How to prefetch image data?
    out0 += in0.x * weights0;
    out0 += in0.y * weights1;
    out0 += in0.z * weights2;
    out0 += in0.w * weights3;

    out1 += in1.x * weights0;
    out1 += in1.y * weights1;
    out1 += in1.z * weights2;
    out1 += in1.w * weights3;

    out2 += in2.x * weights0;
    out2 += in2.y * weights1;
    out2 += in2.z * weights2;
    out2 += in2.w * weights3;

    out3 += in3.x * weights0;
    out3 += in3.y * weights1;
    out3 += in3.z * weights2;
    out3 += in3.w * weights3;

    in_x_base += in_width;
  }

#ifdef FUSED_RELU
  // TODO: support relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
#endif

  const int out_x_base = out_ch_blk * width;
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
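Note on the work distribution above: each work-item produces four output columns strided by out_w_blks. A small example, assuming STRIDE == 1 and width = in_width = 10:

// out_w_blks = ceil(10 / 4) = 3 work-items per output row; the item with
// out_w_blk = 1 computes columns 1, 4, 7 and returns before column 10,
// whose read coordinate was already clamped via select(..., INT_MIN, ...).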
...@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
...@@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
  DATA_TYPE4 out0 =
      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
  DATA_TYPE4 out4 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
  DATA_TYPE4 out4 = 0;
#endif

#if STRIDE == 1
  int in_width0 = out_w_blk - padding_left;
  int in_width1 = in_width0 + out_w_blks;
  int in_width2 = in_width1 + out_w_blks;
  int in_width3 = in_width2 + out_w_blks;
  int in_width4 = in_width3 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
#else
  int in_width0 = out_w_blk * 2 - padding_left;
  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
  const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif

  const int batch_idx = (out_hb / out_height) * in_height;

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
  int in_idx, in_width_idx;
  // Unrolling this loop hurts performance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
        in_idx = in_ch_blk * in_width;
        int in_width_value;
#define READ_INPUT(i) \
        in_width_value = in_width##i + width_idx; \
        in_width_value = select(in_idx + in_width_value, \
                                -1, \
                                (in_width_value < 0 || in_width_value >= in_width)); \
        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
        READ_INPUT(2);
        READ_INPUT(3);
        READ_INPUT(4);

#undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));

        // Would prefetching into L2 improve performance? How to prefetch image data?
        // Interleaving load and mul does not improve performance as expected
        out0 += in0.x * weights0;
        out0 += in0.y * weights1;
        out0 += in0.z * weights2;
        out0 += in0.w * weights3;

        out1 += in1.x * weights0;
        out1 += in1.y * weights1;
        out1 += in1.z * weights2;
        out1 += in1.w * weights3;

        out2 += in2.x * weights0;
        out2 += in2.y * weights1;
        out2 += in2.z * weights2;
        out2 += in2.w * weights3;

        out3 += in3.x * weights0;
        out3 += in3.y * weights1;
        out3 += in3.z * weights2;
        out3 += in3.w * weights3;

        out4 += in4.x * weights0;
        out4 += in4.y * weights1;
        out4 += in4.z * weights2;
        out4 += in4.w * weights3;
      }
    }
  }

#ifdef FUSED_RELU
  // TODO: support relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
  out4 = fmax(out4, 0);
#endif

  const int out_x_base = out_ch_blk * out_width;
  int w = out_w_blk;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out3);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out4);
}
#include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#else
      res = vec_pooling_3_s2(input_ptr, in_width);
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
#ifdef FP16
#define MIN_VALUE -USHRT_MAX
#else
#define MIN_VALUE -FLT_MAX
#endif

inline int calculate_avg_block_size(const int pool_size,
                                    const int pos_h,
                                    const int pos_w,
                                    const int h_size,
                                    const int w_size) {
  const int h_start = max(0, pos_h);
  const int w_start = max(0, pos_w);
  const int h_end = min(pos_h + pool_size, h_size);
  const int w_end = min(pos_w + pool_size, w_size);
  return (h_end - h_start) * (w_end - w_start);
}

// Supported data type: half/float
__kernel void pooling(__read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
                      __private const int pad_top,
                      __private const int pad_left,
                      __private const int stride,
                      __private const int pooling_size,
                      __write_only image2d_t output) {
  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);
  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

  const int batch_idx = (out_hb_idx / out_height) * in_height;
  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
  const int in_width_start = out_width_idx * stride - pad_left;
  const int in_channel_offset = out_chan_idx * in_width;

#ifdef POOL_AVG
  DATA_TYPE4 res = 0;
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    for (int width = 0; width < pooling_size; ++width) {
      int in_width_idx = in_width_start + width;
      in_width_idx = select(in_channel_offset + in_width_idx,
                            -1,
                            (in_width_idx < 0 || in_width_idx >= in_width));

      DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
      res = res + in;
    }
  }
  const int block_size = calculate_avg_block_size(pooling_size,
                                                  in_height_start, in_width_start,
                                                  in_height, in_width);
  res /= block_size;
#else
  DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    if (in_height_idx != -1) {
      for (int width = 0; width < pooling_size; ++width) {
        int in_width_idx = in_width_start + width;
        in_width_idx = select(in_channel_offset + in_width_idx,
                              -1,
                              (in_width_idx < 0 || in_width_idx >= in_width));
        if (in_width_idx != -1) {
          DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
          res = fmax(res, in);
        }
      }
    }
  }
#endif

  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
}
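Worth noting for the average-pool path: calculate_avg_block_size divides by the number of taps that actually fall inside the input, so border windows are not diluted by padding. A quick check of the arithmetic:

// pooling_size = 3, in_height = in_width = 5, window starting at (-1, -1):
//   h_start = max(0, -1) = 0, h_end = min(-1 + 3, 5) = 2  ->  2 rows
//   w_start = 0, w_end = 2                                ->  2 cols
// block_size = 2 * 2 = 4, i.e. the corner output averages 4 values, not 9.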
#include <common.h>

__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
                                      __private const int out_height) {
  const int ch_blk = get_global_id(0);
  const int ch_blks = get_global_size(0);
  const int w = get_global_id(1);
  const int out_width = get_global_size(1);
  const int hb = get_global_id(2);
  const int b = hb / out_height;
  const int h = hb % out_height;

  const float h_in = h * height_scale;
  const float w_in = w * width_scale;
...@@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int in_w_offset = ch_blk * in_width;
  const int in_h_offset = b * in_height;

  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
                                    (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
                                     (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
                                       (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
                                        (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;

  DATA_TYPE4 out = top + (bottom - top) * h_lerp;

  const int out_w_offset = ch_blk * out_width;
  const int out_h_offset = b * out_height;

  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
}
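For clarity, the two-stage lerp above is the standard bilinear formula; expanding the code's expressions (an annotation, not part of the source):

// top    = (1 - w_lerp) * top_left    + w_lerp * top_right
// bottom = (1 - w_lerp) * bottom_left + w_lerp * bottom_right
// out    = (1 - h_lerp) * top         + h_lerp * bottom
// e.g. h_lerp = w_lerp = 0.5 yields the average of the four texels.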
...@@ -9,50 +9,56 @@ namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output);

template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      const Tensor *filter,
                                                      const Tensor *bias,
                                                      Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
                                       const int *padding, const DataType dt,
                                       Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
      {nullptr, nullptr},
      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
  if (!input->is_image() || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }

  std::vector<index_t> output_shape(4);
...@@ -61,17 +67,24 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
      input->shape().data(), filter->shape().data(), dilations_,
      strides_, paddings_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
    conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
  } else {
    Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
  }
}
template
struct Conv2dFunctor<DeviceType::OPENCL, float>;
template
struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
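A note on the dispatch in this functor: selector is indexed by (kernel_size - 1, stride - 1), and nullptr entries fall through to the general Conv2dOpencl path. For example:

// selector[3 - 1][2 - 1] == Conv2dOpenclK3x3S2  // specialized 3x3 stride-2
// selector[5 - 1][1 - 1] == nullptr             // 5x5 falls back to Conv2dOpencl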
...@@ -5,83 +5,44 @@
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {
void Conv1x1(const Tensor *input,
             const Tensor *filter,
             const Tensor *bias,
             const bool fused_relu,
             const int stride,
             const DataType dt,
             Tensor *output) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_blocks = RoundUpDiv4(width);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);

  MACE_CHECK(input_batch == batch);

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (bias != nullptr) {
    built_options.emplace("-DBIAS");
  }
  if (fused_relu) {
    built_options.emplace("-DFUSED_RELU");
  }

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input,
    conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
  }
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));

  auto command_queue = runtime->command_queue();
  cl_int error;
  error = command_queue.enqueueNDRangeKernel(
      conv_2d_kernel, cl::NullRange,
      cl::NDRange(static_cast<uint32_t>(channel_blocks),
                  static_cast<uint32_t>(width_blocks),
                  static_cast<uint32_t>(height * batch)),
      cl::NDRange(4, 15, 8),  // TODO auto tuning
      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);
}

extern void Conv2dOpenclK1x1S1(const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
                               const DataType dt,
                               Tensor *output) {
  Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
};

extern void Conv2dOpenclK1x1S2(const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
                               const DataType dt,
                               Tensor *output) {
  Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
};

} // namespace kernels
......
...@@ -12,8 +12,9 @@ namespace mace {
namespace kernels {

static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (fused_relu) {
    built_options.emplace("-DFUSED_RELU");
  }

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();

  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);

  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
      conv_2d_kernel, cl::NullRange,
      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
                  static_cast<uint32_t>(height * batch)),
      cl::NDRange(16, 16, 4),
      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);
}

void Conv2dOpenclK3x3S1(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
                        const DataType dt,
                        Tensor *output) {
  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
};

void Conv2dOpenclK3x3S2(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
                        const DataType dt,
                        Tensor *output) {
  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
};

} // namespace kernels
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
} // namespace kernels
} // namespace mace
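
The generic conv_2d path bakes the stride and the optional fused ReLU into the program through the -DSTRIDE= and -DFUSED_RELU build options, so a single kernel source covers every supported stride. A minimal host-side sketch of calling it; the tensors and padding values here are placeholders, not values from this commit:

// Sketch only: `input`, `filter`, `output` are assumed to be image-backed
// Tensors prepared by the caller; `paddings` would come from
// CalcNHWCPaddingAndOutputSize as in the functors above.
const int paddings[2] = {0, 0};
Conv2dOpencl(input, filter, /*bias=*/nullptr, /*fused_relu=*/false,
             /*stride=*/1, paddings, DT_FLOAT, output);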
...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, ...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/fused_conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter " << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, falling back to the slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
FusedConv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
}
template
struct FusedConv2dFunctor<DeviceType::OPENCL, float>;
template
struct FusedConv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
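
The `selector` table keys specialized kernels by `[kernel_h - 1][strides_[0] - 1]`, so only 1x1 and 3x3 square filters with stride 1 or 2 currently hit the image-based fast path; everything else falls through to the CPU functor. The same dispatch idea in isolation, with illustrative names that are not part of the commit:

#include <cstdio>

typedef void (*KernelFn)();

void K1x1S1() { std::printf("1x1, stride 1\n"); }
void K3x3S2() { std::printf("3x3, stride 2\n"); }

// rows: kernel size 1..5, cols: stride 1..2; nullptr = unimplemented
static const KernelFn kSelector[5][2] = {{K1x1S1, nullptr},
                                         {nullptr, nullptr},
                                         {nullptr, K3x3S2},
                                         {nullptr, nullptr},
                                         {nullptr, nullptr}};

bool Dispatch(int kernel, int stride) {
  if (kernel < 1 || kernel > 5 || stride < 1 || stride > 2 ||
      kSelector[kernel - 1][stride - 1] == nullptr) {
    return false;  // caller would fall back to the slow path
  }
  kSelector[kernel - 1][stride - 1]();
  return true;
}

int main() {
  Dispatch(3, 2);  // prints "3x3, stride 2"
  Dispatch(5, 1);  // returns false: no 5x5 kernel registered
}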
...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::string DataTypeToCLType(const DataType dt) { std::string DtToCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "float"; return "float";
case DT_HALF: case DT_HALF:
return "half"; return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default: default:
LOG(FATAL) << "Unsupported data type"; LOG(FATAL) << "Unsupported data type";
return ""; return "";
} }
} }
std::string DataTypeToOPENCLCMDDataType(const DataType dt) { std::string DtToCLCMDDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "f"; return "f";
...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { ...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
} }
} }
std::string DtToUpstreamCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpstreamCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
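
The new DtToUpstreamCL* helpers intentionally map both DT_FLOAT and DT_HALF to the float variants: kernels built with these options keep half storage but promote in-kernel arithmetic to float. A sketch mirroring the call sites above:

// `dt` comes from the op's "T" type constraint at the call sites above.
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DT_HALF));         // "-DDATA_TYPE=float"
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DT_HALF));  // "-DCMD_DATA_TYPE=f"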
...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::string DataTypeToOPENCLCMDDataType(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DataTypeToCLType(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt);
std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -10,131 +10,94 @@ ...@@ -10,131 +10,94 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Pooling3(const Tensor *input, static void Pooling(const Tensor *input,
const int *stride, const int *stride,
const PoolingType type, const int *paddings,
Tensor *output) { const int pooling_size,
if (type != MAX) { const PoolingType type,
MACE_NOT_IMPLEMENTED; const DataType dt,
} Tensor *output) {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t out_height = output->dim(1);
index_t out_height = output->dim(2); index_t out_width = output->dim(2);
index_t out_width = output->dim(3); index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(pixel_width * out_height), static_cast<uint32_t>(batch * out_height),
}; };
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128}; uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0; uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2))); pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height)); pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width)); pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]); pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error;
} }
static void PoolingN(const Tensor *input, template<typename T>
const int *stride, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *paddings, Tensor *output) {
const int pooling_size, MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel does not support dilation yet";
const PoolingType type, std::vector<index_t> output_shape(4);
Tensor *output) { std::vector<int> paddings(2);
if (type != AVG) { std::vector<index_t> filter_shape = {
MACE_NOT_IMPLEMENTED; kernels_[0], kernels_[1],
} input->dim(3), input->dim(3)
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
}; };
auto runtime = OpenCLRuntime::Get(); kernels::CalcNHWCPaddingAndOutputSize(
std::set<std::string> built_options; input->shape().data(), filter_shape.data(),
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); dilations_, strides_, this->padding_,
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128}; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); DataTypeToEnum<T>::value, output);
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
} }
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
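
The local work-group size is now derived at runtime instead of being hard-coded: each dimension is clamped so that lws[0] * lws[1] * lws[2] never exceeds the kernel's reported max work-group size. A worked instance of the clamping, with illustrative numbers:

// Assume channel_blocks = 8, out_width = 120, out_height * batch = 60,
// and the device reports kwg_size = 256.
uint32_t lws0 = std::min<uint32_t>(8, 256);                   // 8
uint32_t lws1 = std::min<uint32_t>(120, 256 / lws0);          // 32
uint32_t lws2 = std::min<uint32_t>(60, 256 / (lws0 * lws1));  // 1
// Product: 8 * 32 * 1 = 256 <= kwg_size, as OpenCL requires.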
...@@ -6,24 +6,33 @@ ...@@ -6,24 +6,33 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h" #include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output) { const Tensor *input, const Tensor *resize_dims, Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape {batch, channels, out_height, out_width}; std::vector<index_t> output_shape {batch, out_height, out_width, channels};
output->Resize(out_shape); if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
uint32_t idx = 0; uint32_t idx = 0;
rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
rb_kernel.setArg(idx++, height_scale); rb_kernel.setArg(idx++, height_scale);
rb_kernel.setArg(idx++, width_scale); rb_kernel.setArg(idx++, width_scale);
rb_kernel.setArg(idx++, static_cast<int>(in_height)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
rb_kernel.setArg(idx++, static_cast<int>(in_width)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel( cl_int error = command_queue.enqueueNDRangeKernel(
rb_kernel, cl::NullRange, rb_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch * channels), cl::NDRange(static_cast<int32_t>(channel_blocks),
static_cast<int>(out_height), static_cast<int>(out_width)), static_cast<int32_t>(out_width),
// TODO (heliangliang) tuning and fix when kwg_size < divisor static_cast<int32_t>(out_height * batch)),
cl::NDRange(1, 16, kwg_size / 16), // TODO tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te ...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
Tensor *batch_tensor) { Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options); auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -18,36 +18,66 @@ enum PoolingType { ...@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> struct PoolingFunctorBase {
struct PoolingFunctor { PoolingFunctorBase(const PoolingType pooling_type,
PoolingFunctor(const PoolingType pooling_type, const int *kernels,
const int *kernels, const int *strides,
const int *strides, const Padding padding,
const Padding padding, const int *dilations)
const int *dilations)
: pooling_type_(pooling_type), : pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_(padding), padding_(padding),
dilations_(dilations) {} dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor); Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>(); const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0]; index_t batch = output_shape[0];
index_t channels = output_shape[1]; index_t height = output_shape[1];
index_t height = output_shape[2]; index_t width = output_shape[2];
index_t width = output_shape[3]; index_t channels = output_shape[3];
index_t out_image_size = height * width; index_t out_image_size = height * width;
index_t input_channels = input_shape[1]; index_t input_height = input_shape[1];
index_t input_height = input_shape[2]; index_t input_width = input_shape[2];
index_t input_width = input_shape[3]; index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width; index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0]; int kernel_h = kernels_[0];
...@@ -59,11 +89,6 @@ struct PoolingFunctor { ...@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0]; int dilation_h = dilations_[0];
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
...@@ -71,25 +96,24 @@ struct PoolingFunctor { ...@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) { T res = std::numeric_limits<T>::lowest();
T max = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
max = std::max(max, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
} }
output[out_offset] = max; *output = res;
out_offset += 1; output++;
} }
} }
} }
...@@ -97,11 +121,10 @@ struct PoolingFunctor { ...@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) { } else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) {
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -110,14 +133,14 @@ struct PoolingFunctor { ...@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
} }
} }
output[out_offset] = sum / block_size; *output = sum / block_size;
out_offset += 1; output++;
} }
} }
} }
...@@ -125,22 +148,26 @@ struct PoolingFunctor { ...@@ -125,22 +148,26 @@ struct PoolingFunctor {
} }
} }
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
}; };
template <> template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor); Tensor *output_tensor);
template <> template<typename T>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()( struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Tensor *input_tensor, PoolingFunctor(const PoolingType pooling_type,
Tensor *output_tensor); const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
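
Both pooling loops above were reordered from NCHW to NHWC: channels become the innermost, contiguous dimension, which is why the inner offset changed to `in_offset + (inh * input_width + inw) * input_channels` and the output is written through a running pointer. The flat NHWC index the code relies on, spelled out as a sketch (`index_t` stands in for the project's index type):

#include <cstdint>
using index_t = std::int64_t;  // assumption: mirrors mace's index type

// Flat offset of element (b, h, w, c) in an NHWC tensor of shape {B, H, W, C}.
index_t NHWCOffset(index_t b, index_t h, index_t w, index_t c,
                   index_t H, index_t W, index_t C) {
  return ((b * H + h) * W + w) * C + c;
}
// Equivalently: b * (H * W * C) + c + (h * W + w) * C, which is exactly
// in_offset + (inh * input_width + inw) * input_channels in the loops above.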
...@@ -61,63 +61,90 @@ void ResizeImage(const T *images, ...@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
const index_t channels, const index_t channels,
const std::vector<CachedInterpolation> &xs_vec, const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys, const std::vector<CachedInterpolation> &ys,
float *output) { T *output) {
const index_t in_channel_size = in_height * in_width; const index_t in_batch_num_values = channels * in_height * in_width;
const index_t in_batch_num_values = channels * in_channel_size; const index_t out_batch_num_values = channels * out_height * out_width;
const index_t out_channel_size = out_height * out_width;
const index_t out_batch_num_values = channels * out_channel_size;
const CachedInterpolation *xs = xs_vec.data(); const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2) #pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) { const T *batch_input_ptr = images + in_batch_num_values * b;;
const T *input_ptr = T *batch_output_ptr = output + out_batch_num_values * b;
images + in_batch_num_values * b + in_channel_size * c;
float *output_ptr = for (index_t y = 0; y < out_height; ++y) {
output + out_batch_num_values * b + out_channel_size * c; const T *y_lower_input_ptr =
for (index_t y = 0; y < out_height; ++y) { batch_input_ptr + ys[y].lower * in_width * channels;
const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width; const T *y_upper_input_ptr =
const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width; batch_input_ptr + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp; T *y_output_ptr = batch_output_ptr + y * out_width * channels;
for (index_t x = 0; x < out_width; ++x) { const float ys_lerp = ys[y].lerp;
auto xs_lower = xs[x].lower;
auto xs_upper = xs[x].upper; for (index_t x = 0; x < out_width; ++x) {
auto xs_lerp = xs[x].lerp; const float xs_lerp = xs[x].lerp;
const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
const float top_left = ys_input_lower_ptr[xs_lower]; const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
const float top_right = ys_input_lower_ptr[xs_upper]; const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
const float bottom_left = ys_input_upper_ptr[xs_lower]; const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
const float bottom_right = ys_input_upper_ptr[xs_upper]; T *output_ptr = y_output_ptr + x * channels;
output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left, for (index_t c = 0; c < channels; ++c) {
bottom_right, xs_lerp, ys_lerp); const T top_left = top_left_ptr[c];
const T top_right = top_right_ptr[c];
const T bottom_left = bottom_left_ptr[c];
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
} }
output_ptr += out_width;
} }
} }
} }
} }
} }
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners), size_(size) {}
protected:
void GetOutputSize(const Tensor *resize_dims,
index_t *out_height,
index_t *out_width) {
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
bool align_corners_;
std::vector<index_t> size_;
};
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ResizeBilinearFunctor { struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
public:
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: align_corners_(align_corners), size_(size) {} : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *resize_dims, const Tensor *resize_dims,
Tensor *output) { Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, out_height, out_width, channels};
output->Resize(out_shape); output->Resize(out_shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor { ...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
channels, xs, ys, output_data); channels, xs, ys, output_data);
} }
};
protected: template<typename T>
void GetOutputSize(const Tensor *resize_dims, struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
index_t *out_height, ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
index_t *out_width) { : ResizeBilinearFunctorBase(size, align_corners) {}
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
private: void operator()(const Tensor *input,
bool align_corners_; const Tensor *resize_dims,
std::vector<index_t> size_; Tensor *output);
}; };
template <>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
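
ResizeImage now gathers the four corner pointers per output pixel and interpolates every channel in the inner loop. For reference, the blend that ComputeLerp is expected to perform is the standard bilinear one; the actual helper is defined outside this diff, so this is a sketch:

// Blend horizontally with xs_lerp, then vertically with ys_lerp.
float ComputeLerpSketch(float top_left, float top_right,
                        float bottom_left, float bottom_right,
                        float xs_lerp, float ys_lerp) {
  const float top = top_left + (top_right - top_left) * xs_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * xs_lerp;
  return top + (bottom - top) * ys_lerp;
}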
...@@ -22,4 +22,10 @@ def if_android_arm64(a): ...@@ -22,4 +22,10 @@ def if_android_arm64(a):
return select({ return select({
"//mace:android_arm64": a, "//mace:android_arm64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
\ No newline at end of file
def if_profiling(a):
return select({
"//mace:is_profiling": a,
"//conditions:default": [],
})
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
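
With OpKeyBuilder, the registry key now carries the op name plus its "T" type constraint, which is what lets the float and half OpenCL kernels above coexist under one op name. An op definition selects among them through its integer "T" argument; a sketch, assuming an OpsTestNet `net` as in the tests that follow:

// An OperatorDef carrying T = DT_HALF resolves to the half-typed
// registration above (cf. the .AddIntArg("T", ...) calls in the tests).
OpDefBuilder("AddN", "AddNExample")
    .Input("InputImage0")
    .Input("InputImage1")
    .Output("OutputImage")
    .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
    .Finalize(net.NewOperatorDef());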
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
namespace mace { namespace mace {
template<DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, Workspace *ws)
...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> { ...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override { bool Run() override {
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size(); int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr); vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -9,47 +9,69 @@ ...@@ -9,47 +9,69 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) { static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM"); // Add input data
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str()); net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
} }
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data if (D == DeviceType::OPENCL) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size}); BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} }
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
} }
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \ int iters) { \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::ItemsProcessed(tot); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
} \ AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) } \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ #define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON); BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float); BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(100, 1000, float); // BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(100, 10000, float); BM_ADDN(4, 1, 240, 240, 256, float);
} // namespace mace // BM_ADDN(4, 1, 240, 240, 256, half);
\ No newline at end of file
} // namespace mace
...@@ -9,7 +9,7 @@ namespace mace { ...@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {}; class AddnOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void SimpleAdd2() { void SimpleAdd2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -20,30 +20,26 @@ void SimpleAdd2() { ...@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd2) { TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd2) { /*
SimpleAdd2<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
SimpleAdd2<DeviceType::OPENCL>(); */
}
template<DeviceType D> template <DeviceType D>
void SimpleAdd3() { void SimpleAdd3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -55,62 +51,80 @@ void SimpleAdd3() { ...@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd3) { TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd3) { /*
SimpleAdd3<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
} */
template<DeviceType D> template <DeviceType D>
void RandomTest() { void RandomTest() {
// Construct graph testing::internal::LogToStderr();
OpsTestNet net; srand(time(NULL));
OpDefBuilder("AddN", "AddNTest")
.Input("Input1") for (int round = 0; round < 10; ++round) {
.Input("Input2") // generate random input
.Output("Output") index_t n = 1 + (rand() % 5);
.Finalize(net.NewOperatorDef()); index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
// Add input data index_t c = 1 + (rand() % 32);
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4}); int input_num = 2 + rand() % 3;
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4}); // Construct graph
OpsTestNet net;
// Check auto op_def = OpDefBuilder("AddN", "AddNTest");
net.RunOp(D); for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
Tensor result; }
result.Copy(*net.GetOutput("Output")); op_def.Output("Output").Finalize(net.NewOperatorDef());
// Run // Add input data
net.RunOp(); for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5); }
}
// run on cpu
TEST_F(AddnOpTest, CPURandom) { net.RunOp();
RandomTest<DeviceType::CPU>(); // Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
} }
TEST_F(AddnOpTest, NEONRandom) { TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
} // namespace mace } // namespace mace
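
RandomTest establishes the pattern used throughout these OpenCL tests: compute a CPU reference first, replay the graph through image buffers in half precision, then compare with a loose tolerance (0.1), since fp16 carries only about three decimal digits. The same flow condensed, as a sketch:

net.RunOp();                              // 1) CPU reference
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// 2) wrap inputs as OpenCL images (BufferToImage) and rebuild the op
//    with .AddIntArg("T", DT_HALF), then:
net.RunOp(DeviceType::OPENCL);            // 3) device run
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
                                         kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);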
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, float>);
} // namespace mace REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
\ No newline at end of file .TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
} // namespace mace
...@@ -13,28 +13,45 @@ static void BatchNorm( ...@@ -13,28 +13,45 @@ static void BatchNorm(
int iters, int batch, int channels, int height, int width) { int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming(); mace::testing::StopTiming();
if ( D == OPENCL )
OpenCLRuntime::EnableProfiling();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(D); net.RunOp(D);
......
...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; ...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
// Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 6, 2}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
...@@ -32,12 +22,44 @@ void Simple() { ...@@ -32,12 +22,44 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); 3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { ...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
Simple<DeviceType::CPU>(); Simple<DeviceType::CPU>();
} }
/*
TEST_F(BatchNormOpTest, SimpleNEON) { TEST_F(BatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>(); Simple<DeviceType::NEON>();
} }
*/
TEST_F(BatchNormOpTest, SimpleOPENCL) { TEST_F(BatchNormOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>(); Simple<DeviceType::OPENCL>();
} }
/*
TEST_F(BatchNormOpTest, SimpleRandomNeon) { TEST_F(BatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL)); srand(time(NULL));
...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
*/
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL)); srand(time(NULL));
...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 64; index_t height = 64;
index_t width = 64; index_t width = 64;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// tuning // run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
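The setenv/unsetenv pair around the first OpenCL run is what triggers the kernel-parameter search; the second run then replays the tuned parameters. A hypothetical RAII wrapper for the same pattern (not part of this codebase; setenv/unsetenv are POSIX, declared in <cstdlib>):

class ScopedTuning {
 public:
  ScopedTuning() { setenv("MACE_TUNING", "1", 1); }  // enable tuning search
  ~ScopedTuning() { unsetenv("MACE_TUNING"); }       // back to tuned replay
};

// Usage: { ScopedTuning t; net.RunOp(DeviceType::OPENCL); }  // tuning run
//        net.RunOp(DeviceType::OPENCL);                      // tuned run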
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 103; index_t height = 103;
index_t width = 113; index_t width = 113;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync(); net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
} }
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND")
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { ...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
} }
...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) { ...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
} }
template<DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
}
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
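The 1e-3 tolerance on this float-to-half roundtrip comes straight from half precision: 10 stored significand bits put neighboring half values in [1, 2) a distance 2^-10 apart, so one round-to-nearest step loses at most half of that. A quick numeric check:

constexpr float kHalfUlpNearOne = 1.0f / 1024.0f;       // 2^-10 ~= 9.77e-4
constexpr float kMaxRoundError  = kHalfUlpNearOne / 2;  // ~4.9e-4 < 1e-3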
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,12 +6,31 @@ ...@@ -6,12 +6,31 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
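Because the registry key now encodes the "T" constraint, the float and half instantiations of one op coexist, and the "T" integer argument added to the OperatorDefs in the tests below is what routes a node to one of them. A minimal dispatch sketch under that assumption — GetIntArg, the registry map, and the runtime-typed TypeConstraint overload are illustrative names, not the actual API:

// Hypothetical lookup: op name plus the "T" attribute rebuild the key.
DataType t = static_cast<DataType>(GetIntArg(op_def, "T", DT_FLOAT));
std::string key = OpKeyBuilder(op_def.type().c_str())
                      .TypeConstraint("T", t)
                      .Build();
std::unique_ptr<OperatorBase> op(registry.at(key)(op_def, ws));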
...@@ -33,9 +33,9 @@ static void Conv2d(int iters, ...@@ -33,9 +33,9 @@ static void Conv2d(int iters,
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -44,6 +44,7 @@ static void Conv2d(int iters, ...@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -54,6 +55,7 @@ static void Conv2d(int iters, ...@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
...@@ -91,39 +93,39 @@ static void Conv2d(int iters, ...@@ -91,39 +93,39 @@ static void Conv2d(int iters,
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments //BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); //BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); //BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); //BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace } // namespace mace
...@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) { ...@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3SAME<DeviceType::NEON>(); TestSimple3x3SAME<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() { void TestNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() { ...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() { ...@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
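The expected 18.1f is easy to verify by hand:

// Hand check: a VALID 3x3 conv over a 3x3x2 all-ones input with an all-ones
// filter sums 3 * 3 * 2 = 18 products of 1.0f; adding the 0.1f bias gives
// 18.1f. The relaxed 0.01 tolerance also covers half-typed runs.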
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() { void TestNHWCSimple3x3SAME() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() { ...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() { ...@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() {
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUSimple) { TEST_F(Conv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU>(); TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU>(); TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLSimple) { TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL>(); TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL>(); TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) { ...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>(); TestSimple3x3WithoutBias<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUWithoutBias) { TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU>(); TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLWithoutBias) { TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>(); TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) { ...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>(); TestCombined3x3<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestNHWCCombined3x3() { static void TestNHWCCombined3x3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 2}, "Filter", {3, 3, 2, 2},
{1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f}); 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage") .Input("InputImage")
...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() { ...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() { ...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() { ...@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() {
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUCombined) { TEST_F(Conv2dOpTest, CPUStride2) {
TestNHWCCombined3x3<DeviceType::CPU>(); TestNHWCCombined3x3<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
void TestConv1x1() { void TestConv1x1() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
...@@ -425,8 +428,37 @@ void TestConv1x1() { ...@@ -425,8 +428,37 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
...@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { ...@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>(); TestConv1x1<DeviceType::CPU>();
} }
//TEST_F(Conv2dOpTest, OPENCLConv1x1) { TEST_F(Conv2dOpTest, OPENCLConv1x1) {
// TestConv1x1<DeviceType::OPENCL>(); TestConv1x1<DeviceType::OPENCL>();
//} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand(time(NULL)); srand(time(NULL));
// generate random input // generate random input
index_t batch = 3 + rand() % 10; index_t batch = 3 + (rand() % 10);
index_t height = shape[0]; index_t height = shape[0];
index_t width = shape[1]; index_t width = shape[1];
index_t input_channels = shape[2] + rand() % 10; index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + rand() % 10; index_t output_channels = shape[3] + (rand() % 10);
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}); "Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
...@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
for (int kernel_size : {3}) { for (int kernel_size : {1, 3}) {
for (int stride : {1}) { for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME); func(kernel_size, kernel_size, stride, stride, SAME);
} }
} }
} }
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
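The "aligned" shape keeps channel counts divisible by 4 while the unaligned one does not, which matters if the OpenCL image layout packs 4 channels per RGBA texel — an assumption about the packing here; the rounding math below is illustrative:

// Assumed NHWC -> image2d packing, 4 channels per texel; unaligned channel
// counts leave a partially filled texel at the end of every pixel.
inline size_t ImageWidthInTexels(size_t width, size_t channels) {
  return width * ((channels + 3) / 4);
}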
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
srand(time(NULL));
auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t input_channels = filter_shape[2] + (rand() % 10);
index_t output_channels = filter_shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
}
}
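The loose 0.5 tolerance reflects half-precision accumulation: the absolute error of a dot product grows roughly linearly with the reduction length K = kernel_h * kernel_w * input_channels. An order-of-magnitude estimate, not a rigorous bound:

constexpr float kHalfEps = 1.0f / 1024.0f;  // 2^-10 significand spacing
inline float RoughDotError(int reduction_len) {
  return reduction_len * kHalfEps;          // e.g. K = 256 -> ~0.25
}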
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 1, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{3, 3, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{15, 1, 256, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 15, 256, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{1, 1, 5, 7});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{3, 3, 5, 7});
} }
...@@ -6,15 +6,21 @@ ...@@ -6,15 +6,21 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(DepthwiseConv2d, REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>); DepthwiseConv2dOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(DepthwiseConv2d, REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>); DepthwiseConv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(DepthwiseConv2d, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
namespace mace {
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, half>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_FUSED_CONV_2D_H_
#define MACE_OPS_FUSED_CONV_2D_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/fused_conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
public:
FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
return true;
}
private:
kernels::FusedConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
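The header only wires up the functor; the fused kernel itself lives in mace/kernels/fused_conv_2d.h. The tests below feed all-negative activations and expect all-zero outputs, which is consistent with a ReLU folded into the convolution epilogue — a scalar sketch of that inference, not the actual kernel:

#include <algorithm>

inline float FusedConvEpilogue(float acc, float bias) {
  return std::max(acc + bias, 0.0f);  // bias add + ReLU in one pass (assumed)
}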
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
class FusedConv2dOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(FusedConv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {1, 1, 5, 2},
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
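The expected rows are quick to verify, since a 1x1 convolution reduces only over channels:

// Hand check: each output pixel dots 5 ones against the per-output-channel
// filter column, then adds the bias:
//   channel 0: 5 * 1.0f + 0.1f = 5.1f
//   channel 1: 5 * 2.0f + 0.2f = 10.2f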
TEST_F(FusedConv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
TEST_F(FusedConv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
}
}
}
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
}
...@@ -6,11 +6,15 @@ ...@@ -6,11 +6,15 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(GlobalAvgPooling, REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::CPU, float>); GlobalAvgPoolingOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(GlobalAvgPooling, REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::NEON, float>); GlobalAvgPoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
......
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) { std::vector<T> &res) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1); std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1, index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
res.resize(size); res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
} }
template <typename T> template <typename T>
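A note on the GenerateRandomRealTypeData change above: std::normal_distribution is only defined for float, double, and long double, so half samples must be drawn as float and cast down. A self-contained sketch of the same pattern (the half.hpp include path is an assumption about where the vendored fp16 library lives):

#include <algorithm>
#include <random>
#include <vector>
#include "half.hpp"  // assumed: the vendored half_float library

std::vector<half_float::half> RandomHalfData(size_t n) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);  // <half> would be ill-formed
  std::vector<half_float::half> res(n);
  std::generate(res.begin(), res.end(), [&gen, &nd] {
    return half_float::half_cast<half_float::half>(nd(gen));
  });
  return res;
}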
...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) { ...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template <typename T, bool is_fp = is_floating_point_type<T>::value> template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<T, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) { static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i)); ExpectEqual(a(i), b(i));
} }
} }
static void Near(const Tensor &x, const Tensor &y, const double abs_err) { static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i; << " index = " << i;
...@@ -334,17 +340,18 @@ template <typename T> ...@@ -334,17 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value, static_assert(is_floating_point_type<T>::value,
"T is not a floating point type"); "T is not a floating point type");
Expector<T>::Near(x, y, abs_err); Expector<T, T>::Near(x, y, abs_err);
} }
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
std::string ToString(const T &input) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
std::stringstream ss; static_assert(is_floating_point_type<EXP_TYPE>::value
ss << input; && is_floating_point_type<RES_TYPE>::value,
return ss.str(); "T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
} }
template <DeviceType D> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net, void BufferToImage(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync(); net.Sync();
} }
template <DeviceType D> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net, void ImageToBuffer(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, ...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
......
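Summarizing the ops_test_util.h changes: Expector and ExpectTensorNear now take separate expected/result element types, so a float CPU reference can be checked directly against a half GPU output, and BufferToImage/ImageToBuffer gained a type parameter that is forwarded as the "T" argument of the conversion op. A stripped-down sketch of the two-type comparison:

#include <cassert>
#include <cmath>
#include <vector>

// EXP_TYPE is the reference (e.g. float), RES_TYPE the device output
// (e.g. half); both are widened to double before comparing.
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectAllNear(const std::vector<EXP_TYPE> &expected,
                   const std::vector<RES_TYPE> &result, double abs_err) {
  assert(expected.size() == result.size());
  for (size_t i = 0; i < expected.size(); ++i) {
    assert(std::fabs(static_cast<double>(expected[i]) -
                     static_cast<double>(result[i])) <= abs_err);
  }
}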
...@@ -6,11 +6,29 @@ ...@@ -6,11 +6,29 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output); functor_(input, output);
return true; return true;
}; };
......
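The explicit shape computation was dropped from PoolingOp::Run, presumably because the functor now resizes the output itself. For reference, a sketch of the standard VALID/SAME output-size arithmetic (TensorFlow-style conventions, which the expected shapes in the tests below match):

#include <cstdint>

int64_t PooledDim(int64_t in, int kernel, int stride, bool same_padding) {
  if (same_padding) return (in + stride - 1) / stride;  // ceil(in / stride)
  return (in - kernel) / stride + 1;                    // VALID, assumes in >= kernel
}
// e.g. MAX_VALID below: PooledDim(4, 2, 2, false) == 2 per spatial dim,
// and MAX_k2x2s2x2 (SAME): PooledDim(9, 2, 2, true) == 5.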
...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4}, "Input", {1, 4, 4, 2},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
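The literal inputs in these tests changed because the tensors are now laid out NHWC instead of NCHW. With channel 0 holding 0..15 and channel 1 holding 16..31, writing the same 1x4x4x2 tensor in NHWC order interleaves the two channels, which reproduces the new literal list exactly:

#include <cstdio>

int main() {
  const int C = 2, H = 4, W = 4;
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        std::printf("%d, ", c * H * W + h * W + w);  // value of channel c at (h, w)
  // prints: 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, ...
}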
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_SAME) { TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph // Construct graph
...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 3, 3}, net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8}); {0, 1, 2, 3, 4, 5, 6, 7, 8});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 4, 4}, "Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 2, 9}, "Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run // Run
net.RunOp(DeviceType::NEON); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17}); auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
template<DeviceType D>
template <DeviceType D>
static void SimpleMaxPooling3S2() { static void SimpleMaxPooling3S2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 1, 3, 9}, "Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
// Run
net.RunOp(D); if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { ...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<CPU>(); SimpleMaxPooling3S2<CPU>();
} }
TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>(); SimpleMaxPooling3S2<OPENCL>();
} }
template <DeviceType D> template<DeviceType D, typename T>
static void AlignedMaxPooling3S2(Padding padding) { static void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
...@@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) {
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 64, 64}); net.AddRandomInput<D, T>("Input", input_shape);
// Run
net.RunOp(D); // run on cpu
net.RunOp();
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on cpu BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
net.RunOp(); OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
} }
// TODO(chenghui) : there is a bug. // TODO(chenghui) : there is a bug.
...@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) {
//} //}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
AlignedMaxPooling3S2<OPENCL>(Padding::VALID); MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
AlignedMaxPooling3S2<OPENCL>(Padding::SAME); MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
} }
template <DeviceType D> TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
static void UnalignedMaxPooling3S2(Padding padding) { MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph // Construct graph
OpsTestNet net; auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 113, 43, 47}); net.AddInputFromArray<DeviceType::CPU, float>(
// Run "Input", {1, 4, 4, 2},
net.RunOp(D); {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
Tensor expected; 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // Run
net.RunOp(); net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); // Check
} auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
// UnalignedMaxPooling3S2<NEON>();
//}
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
} }
template <DeviceType D> template<DeviceType D>
static void SimpleAvgPoolingTest() { static void SimpleAvgPoolingTest() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
SimpleAvgPoolingTest<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>(); SimpleAvgPoolingTest<OPENCL>();
} }
template <DeviceType D> template<DeviceType D, typename T>
static void AlignedAvgPoolingTest(Padding padding) { static void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {4, 4}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {4, 4}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 15, 15}); net.AddRandomInput<D, float>("Input", shape);
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // run on cpu
net.RunOp(); net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
}
TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
AlignedAvgPoolingTest<NEON>(Padding::VALID);
AlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnAlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {7, 7}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {7, 7}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
// Run
net.RunOp(D); net.RunOp(D);
Tensor expected; ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
expected.Copy(*net.GetOutput("Output"));
// Run on cpu ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01);
net.RunOp(); }
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
} }
TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
UnAlignedAvgPoolingTest<NEON>(Padding::VALID); AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
UnAlignedAvgPoolingTest<NEON>(Padding::SAME); AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID); AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME); AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
}
...@@ -6,10 +6,16 @@ ...@@ -6,10 +6,16 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
......
...@@ -6,14 +6,26 @@ ...@@ -6,14 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ResizeBilinear, ResizeBilinearOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(ResizeBilinear, REGISTER_NEON_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::NEON, float>); ResizeBilinearOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(ResizeBilinear, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, float>); ResizeBilinearOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -19,18 +19,30 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -19,18 +19,30 @@ static void ResizeBilinearBenchmark(int iters,
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, channels, input_height, input_width}); {batch, input_height, input_width, channels});
net.AddInputFromArray<D, index_t>("OutSize", {2}, net.AddInputFromArray<D, index_t>("OutSize", {2},
{output_height, output_width}); {output_height, output_width});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("InputImage")
.Input("OutSize")
.Output("OutputImage")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters,
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE) \ #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \ BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, NEON); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL); BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float); BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float); BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float); BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);
......
...@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { ...@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
// Add input data // Add input data
vector<float> input(24); vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2}); net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18}); auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { ...@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
// Add input data // Add input data
vector<float> input(24); vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2}); net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19}); auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
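Both expected outputs follow directly from the resize coordinate mapping (TensorFlow-style, which these tests match): without align_corners the source index is dst * in / out; with align_corners it is dst * (in - 1) / (out - 1). A sketch:

#include <cstdio>

float SrcIndex(int dst, int in, int out, bool align_corners) {
  if (align_corners && out > 1)
    return dst * static_cast<float>(in - 1) / (out - 1);
  return dst * static_cast<float>(in) / out;
}

int main() {
  // Width 4 -> 2: dst 1 maps to src 2.0 without align_corners (values 6,7,8)
  // and to src 3.0 with align_corners (values 9,10,11), as the tests expect.
  std::printf("%.1f %.1f\n", SrcIndex(1, 4, 2, false), SrcIndex(1, 4, 2, true));
}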
...@@ -65,6 +65,7 @@ template <DeviceType D> ...@@ -65,6 +65,7 @@ template <DeviceType D>
void TestRandomResizeBilinear() { void TestRandomResizeBilinear() {
srand(time(nullptr)); srand(time(nullptr));
testing::internal::LogToStderr(); testing::internal::LogToStderr();
for (int round = 0; round < 10; ++round) { for (int round = 0; round < 10; ++round) {
int batch = 1 + rand() % 5; int batch = 1 + rand() % 5;
int channels = 1 + rand() % 100; int channels = 1 + rand() % 100;
...@@ -72,39 +73,54 @@ void TestRandomResizeBilinear() { ...@@ -72,39 +73,54 @@ void TestRandomResizeBilinear() {
int width = 1 + rand() % 100; int width = 1 + rand() % 100;
int in_height = 1 + rand() % 100; int in_height = 1 + rand() % 100;
int in_width = 1 + rand() % 100; int in_width = 1 + rand() % 100;
int align_corners = rand() % 2;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, channels, in_height, in_width}); {batch, in_height, in_width, channels});
net.AddInputFromArray<D, int>("OutSize", {2}, {height, width}); net.AddInputFromArray<D, int>("OutSize", {2}, {height, width});
// Run OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
net.RunOp(D); .Input("Input")
Tensor actual; .Input("OutSize")
actual.Copy(*net.GetOutput("Output")); .Output("Output")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU // Run on CPU
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
Tensor *expected = net.GetOutput("Output"); Tensor expected;
expected.Copy(*net.GetOutput("Output"));
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputImage")
.Input("OutSize")
.Output("OutputImage")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT);
} else {
// TODO support NEON
}
// Check // Check
ExpectTensorNear<float>(*expected, actual, 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
} }
} }
/*
TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) { TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::NEON>(); TestRandomResizeBilinear<DeviceType::NEON>();
} }
*/
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) { TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::OPENCL>(); TestRandomResizeBilinear<DeviceType::OPENCL>();
......
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("SpaceToBatchND")
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -67,12 +67,20 @@ message NodeInput { ...@@ -67,12 +67,20 @@ message NodeInput {
optional int32 output_port = 2; optional int32 output_port = 2;
} }
message OutputShape {
repeated int64 dims = 1;
}
message OperatorDef { message OperatorDef {
repeated string input = 1; repeated string input = 1;
repeated string output = 2; repeated string output = 2;
optional string name = 3; optional string name = 3;
optional string type = 4; optional string type = 4;
repeated Argument arg = 5; repeated Argument arg = 5;
repeated OutputShape output_shape = 6;
// Memory optimization: only support one single output op
optional int32 mem_id = 10 [default = -1];
// for hexagon mace-nnlib // for hexagon mace-nnlib
optional uint32 node_id = 100; optional uint32 node_id = 100;
...@@ -82,6 +90,16 @@ message OperatorDef { ...@@ -82,6 +90,16 @@ message OperatorDef {
repeated int32 out_max_byte_size = 104; // only support 32-bit len repeated int32 out_max_byte_size = 104; // only support 32-bit len
} }
// for memory optimization
message MemoryBlock {
optional int32 mem_id = 1;
optional uint32 x = 2;
optional uint32 y = 3;
}
message MemoryArena {
repeated MemoryBlock mem_block = 1;
}
// for hexagon mace-nnlib // for hexagon mace-nnlib
message InputInfo { message InputInfo {
optional string name = 1; optional string name = 1;
......
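These proto additions support ahead-of-time memory planning: the converter records every op's output shapes, ops whose outputs can share storage get the same mem_id, and a MemoryArena of MemoryBlocks describes the 2D image blocks to preallocate. A hedged sketch of populating the fields through the generated C++ API (the header path and the mace package name are assumptions):

#include <cstdint>
#include <vector>
#include "mace/proto/mace.pb.h"  // assumed location of the generated header

// Record an op's output shape and the preallocated block it should reuse.
void TagForMemoryReuse(mace::OperatorDef *op, int mem_id,
                       const std::vector<int64_t> &dims) {
  op->set_mem_id(mem_id);  // -1 (the default) means no sharing
  mace::OutputShape *shape = op->add_output_shape();
  for (int64_t d : dims) shape->add_dims(d);
}

// Describe one 2D block in the arena.
void AddBlock(mace::MemoryArena *arena, int mem_id, uint32_t x, uint32_t y) {
  mace::MemoryBlock *block = arena->add_mem_block();
  block->set_mem_id(mem_id);
  block->set_x(x);
  block->set_y(y);
}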
...@@ -21,7 +21,7 @@ def main(unused_args): ...@@ -21,7 +21,7 @@ def main(unused_args):
if FLAGS.runtime == 'dsp': if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node) input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
else: else:
output_graph_def = tf_converter_lib.convert_to_mace_pb( output_graph_def = tf_converter_lib.convert_to_mace_pb(
input_graph_def) input_graph_def)
...@@ -62,6 +62,11 @@ def parse_args(): ...@@ -62,6 +62,11 @@ def parse_args():
type=str, type=str,
default="softmax", default="softmax",
help="e.g., softmax") help="e.g., softmax")
parser.add_argument(
"--prequantize",
type=bool,
default=False,
help="e.g., False")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -18,15 +18,6 @@ def convert_tensor(op, tensor): ...@@ -18,15 +18,6 @@ def convert_tensor(op, tensor):
tensor.name = op.outputs[0].name tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape) shape = list(tf_tensor.shape)
if (op.name.find('pointwise_kernel') != -1 or
op.name.find('depthwise_kernel') != -1 or
op.name.endswith('weights') or
op.name.endswith('kernel')) \
and op.outputs[0].consumers()[0].type.find('Conv') != -1:
if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC':
tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
shape = [shape[3], shape[2], shape[0], shape[1]]
# print (tensor.name, shape)
tensor.dims.extend(shape) tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype') tf_dt = op.get_attr('dtype')
...@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = first_op.type op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
padding_arg = op_def.arg.add() padding_arg = op_def.arg.add()
padding_arg.name = 'padding' padding_arg.name = 'padding'
padding_arg.i = padding_mode[first_op.get_attr('padding')] padding_arg.i = padding_mode[first_op.get_attr('padding')]
...@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def): ...@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def):
strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add() data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format' data_format_arg.name = 'data_format'
data_format_arg.s = 'NCHW' data_format_arg.s = 'NHWC'
if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd': if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
bias_add_op = unresolved_ops[1] bias_add_op = unresolved_ops[1]
...@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'BatchNorm' op_def.type = 'BatchNorm'
op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
op_def.output.extend([output.name for output in add_1_op.outputs]) op_def.output.extend([output.name for output in add_1_op.outputs])
output_shapes = []
for output in add_1_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
resolved_count = 7 resolved_count = 7
elif first_op.type == 'Relu6': elif first_op.type == 'Relu6':
...@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'Relu' op_def.type = 'Relu'
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
max_limit_arg = op_def.arg.add() max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit' max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6 max_limit_arg.f = 6
...@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'Pooling' op_def.type = 'Pooling'
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
pooling_type_arg = op_def.arg.add() pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type' pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[first_op.type] pooling_type_arg.i = pooling_type_mode[first_op.type]
...@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def): ...@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def):
kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3]) kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add() data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format' data_format_arg.name = 'data_format'
data_format_arg.s = 'NCHW' data_format_arg.s = 'NHWC'
elif first_op.type == 'Add': elif first_op.type == 'Add':
op_def = net_def.op.add() op_def = net_def.op.add()
op_def.name = first_op.name op_def.name = first_op.name
op_def.type = "AddN" op_def.type = "AddN"
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']: output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
elif first_op.type == 'ConcatV2':
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND',
'BatchToSpaceND', 'BiasAdd', 'FusedBatchNorm']:
op_def = net_def.op.add() op_def = net_def.op.add()
op_def.name = first_op.name op_def.name = first_op.name
op_def.type = first_op.type op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
else: else:
raise Exception('Unknown Op: ' + first_op.name) raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
pass pass
for i in range(resolved_count): for i in range(resolved_count):
......
...@@ -5,7 +5,7 @@ from dsp_ops import DspOps ...@@ -5,7 +5,7 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util from mace.python.tools import graph_util
# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \ # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
# --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node # --runtime dsp --input_node input_node --output_node output_node
padding_mode = { padding_mode = {
'NA': 0, 'NA': 0,
...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
for follow_op in follow_ops: for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef() new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op) new_follow_op.CopyFrom(follow_op)
for i in range(len(follow_op.input)): for i in xrange(len(follow_op.input)):
for k in range(3): for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
new_ops.append(new_follow_op) new_ops.append(new_follow_op)
...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def):
new_net_def = mace_pb2.NetDef() new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values()) new_net_def.tensors.extend(tensor_map.values())
for op in net_def.op: new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
if op.name not in skip_ops:
new_net_def.op.extend([op])
new_net_def.op.extend(new_ops) new_net_def.op.extend(new_ops)
return new_net_def return new_net_def
...@@ -249,29 +247,101 @@ def add_node_id(net_def): ...@@ -249,29 +247,101 @@ def add_node_id(net_def):
return net_def return net_def
def add_input_output_info(net_def, input_node, output_node, graph): def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
for op in net_def.op: input_info = net_def.input_info.add()
if op.name == input_node: input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add() input_info = net_def.input_info.add()
input_info.name = op.name input_info.dims.extend([1,1,1,1])
input_info.node_id = op.node_id input_info.data_type = mace_pb2.DT_FLOAT
input_info.dims.extend(input_tensor.shape.as_list())
input_info.max_byte_size = max_elem_size(input_tensor) output_info = net_def.output_info.add()
input_info.data_type = find_dtype(input_tensor.dtype) output_info.dims.extend(output_tensor.shape.as_list())
elif op.name == output_node: output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add() output_info = net_def.output_info.add()
output_info.name = op.name output_info.dims.extend([1,1,1,1])
output_info.node_id = op.node_id output_info.data_type = mace_pb2.DT_FLOAT
output_info.dims.extend(output_tensor.shape.as_list())
output_info.max_byte_size = max_elem_size(output_tensor)
output_info.data_type = find_dtype(output_tensor.dtype)
return net_def return net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node): def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
skip_ops = set()
new_ops = []
skip_tensors = set()
# INPUT->Flatten->Minf, Maxf->Quantize
for op in net_def.op:
if op.type == 'INPUT':
input_op = op
flatten_op = None
quantize_op = None
for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
if o.type == 'Flatten':
flatten_op = o
elif o.type == 'Quantize':
quantize_op = o
if quantize_op is not None:
minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
new_input_op = mace_pb2.OperatorDef()
new_input_op.name = input_op.name
new_input_op.type = input_op.type
new_input_op.padding = input_op.padding
new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
new_ops.append(new_input_op)
for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
elif op.type == 'OUTPUT':
output_op = op
dequantize_op = get_node_from_map(op_map, output_op.input[0])
if dequantize_op.type == 'Dequantize':
skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
new_output_op = mace_pb2.OperatorDef()
new_output_op.name = output_op.name
new_output_op.type = output_op.type
new_output_op.input.extend(dequantize_op.input)
new_ops.append(new_output_op)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
""" """
nnlib does not have batch norm, so use tensorflow optimizer to fold nnlib does not have batch norm, so use tensorflow optimizer to fold
batch norm with convolution. The fold optimization reorders ops, so batch norm with convolution. The fold optimization reorders ops, so
...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node): ...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
add_output_node(net_def, output_node) add_output_node(net_def, output_node)
# optimized_net_def = reverse_batch_to_space_and_biasadd(net_def) # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
if prequantize:
net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def) net_def_with_node_id = add_node_id(sorted_net_def)
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph) if prequantize:
dtype = mace_pb2.DT_UINT8
else:
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
return final_net_def return final_net_def
...@@ -68,7 +68,7 @@ def main(unused_args): ...@@ -68,7 +68,7 @@ def main(unused_args):
if input_name.endswith('weights:0') and input_name in tensor_shapes: if input_name.endswith('weights:0') and input_name in tensor_shapes:
ksize = tensor_shapes[input_name] ksize = tensor_shapes[input_name]
break break
print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape.as_list(), op.outputs[0].shape.as_list())) print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape))
key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format) key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format)
hist_inc(stats, key) hist_inc(stats, key)
elif op.type in ['FusedResizeAndPadConv2D']: elif op.type in ['FusedResizeAndPadConv2D']:
...@@ -92,6 +92,7 @@ def main(unused_args): ...@@ -92,6 +92,7 @@ def main(unused_args):
size = tensor_values[input_name] size = tensor_values[input_name]
break break
key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners) key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners)
print(key)
hist_inc(stats, key) hist_inc(stats, key)
elif op.type in ['AvgPool', 'MaxPool']: elif op.type in ['AvgPool', 'MaxPool']:
padding = op.get_attr('padding') padding = op.get_attr('padding')
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define MACE_UTILS_UTILS_H_ #define MACE_UTILS_UTILS_H_
#include <sys/time.h> #include <sys/time.h>
#include <sstream>
namespace mace { namespace mace {
template <typename Integer> template <typename Integer>
...@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() { ...@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec); return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
} }
template <typename T>
inline std::string ToString(T v) {
std::ostringstream ss;
ss << v;
return ss.str();
}
} // namespace mace } // namespace mace
#endif // MACE_UTILS_UTILS_H_ #endif // MACE_UTILS_UTILS_H_
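This ToString is the replacement for the copy deleted from ops_test_util.h above (which now includes mace/utils/utils.h); any streamable value converts:

#include <iostream>
#include "mace/utils/utils.h"

int main() {
  std::cout << mace::ToString(42) << " " << mace::ToString(2.5) << std::endl;  // 42 2.5
}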
...@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a ...@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a
STRIP="" STRIP=""
STRIP="--strip always" STRIP="--strip always"
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI # for profiling
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI --define profiling=true
#bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
......
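The build now passes --define profiling=true, which presumably flips a compile-time profiling switch in the OpenCL runtime. A hedged sketch of what such a switch typically gates — event profiling on the command queue (the guard macro and call site here are illustrative, not the actual MACE code):

#include <CL/cl2.hpp>  // assumed header path

cl::CommandQueue MakeQueue(const cl::Context &context, const cl::Device &device) {
#ifdef __ENABLE_PROFILING
  // Profiling builds can then read per-kernel timings via
  // event.getProfilingInfo<CL_PROFILING_COMMAND_START/END>().
  return cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
#else
  return cl::CommandQueue(context, device);
#endif
}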