Commit eef80d7c authored by yejianwu

merge with master

...@@ -23,3 +23,11 @@ config_setting( ...@@ -23,3 +23,11 @@ config_setting(
}, },
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
) )
config_setting(
name = "is_profiling",
define_values = {
"profiling": "true",
},
visibility = ["//visibility:public"],
)
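Note: this config_setting is presumably selected by passing --define profiling=true to bazel build; the if_profiling macro loaded from //mace:mace.bzl in the next hunk is assumed to wrap a select() on //mace:is_profiling, since mace.bzl itself is not part of this diff.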
...@@ -7,7 +7,7 @@ package( ...@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0 licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android") load("//mace:mace.bzl", "if_android", "if_profiling")
cc_library( cc_library(
name = "opencl_runtime", name = "opencl_runtime",
...@@ -19,7 +19,7 @@ cc_library( ...@@ -19,7 +19,7 @@ cc_library(
"runtime/opencl/cl2.hpp", "runtime/opencl/cl2.hpp",
"runtime/opencl/*.h", "runtime/opencl/*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
deps = [ deps = [
":logging", ":logging",
"@opencl_headers//:opencl20_headers", "@opencl_headers//:opencl20_headers",
......
...@@ -1098,7 +1098,7 @@ namespace half_float ...@@ -1098,7 +1098,7 @@ namespace half_float
/// Conversion constructor. /// Conversion constructor.
/// \param rhs float to convert /// \param rhs float to convert
explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {} half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
/// Conversion to single-precision. /// Conversion to single-precision.
/// \return single precision value representing expression value /// \return single precision value representing expression value
......
...@@ -13,6 +13,7 @@ namespace { ...@@ -13,6 +13,7 @@ namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) { static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) { switch (t) {
case DT_HALF: case DT_HALF:
return CL_HALF_FLOAT;
case DT_FLOAT: case DT_FLOAT:
return CL_FLOAT; return CL_FLOAT;
case DT_INT8: case DT_INT8:
...@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(), new cl::Image2D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR , CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format, img_format,
image_shape[0], image_shape[1], image_shape[0], image_shape[1],
0, nullptr, &error); 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return cl_image; return cl_image;
} }
......
...@@ -6,6 +6,24 @@ ...@@ -6,6 +6,24 @@
namespace mace { namespace mace {
OpKeyBuilder::OpKeyBuilder(const char *op_name): op_name_(op_name) {}
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
const DataType allowed) {
type_constraint_[attr_name] = allowed;
return *this;
}
const std::string OpKeyBuilder::Build() {
static const std::vector<std::string> type_order = {"T"};
std::string key = op_name_;
for (auto type : type_order) {
key += type + "_" + DataTypeToString(type_constraint_[type]);
}
return key;
}
std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() { std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
static std::map<int32_t, OperatorRegistry *> g_device_type_registry; static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
return &g_device_type_registry; return &g_device_type_registry;
...@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def, ...@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
Workspace *ws, Workspace *ws,
DeviceType type) { DeviceType type) {
OperatorRegistry *registry = gDeviceTypeRegistry()->at(type); OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
return registry->Create(operator_def.type(), operator_def, ws); const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(operator_def,
"T",
static_cast<int>(DT_FLOAT));
return registry->Create(OpKeyBuilder(operator_def.type().data())
.TypeConstraint("T", static_cast<DataType>(dtype))
.Build(),
operator_def,
ws);
} }
OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws) OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
......
...@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer { ...@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
} }
}; };
class OpKeyBuilder {
public:
explicit OpKeyBuilder(const char *op_name);
OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
template <typename T>
OpKeyBuilder &TypeConstraint(const char *attr_name);
const std::string Build();
private:
std::string op_name_;
std::map<std::string, DataType> type_constraint_;
};
template <typename T>
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
}
#define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \ #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
namespace { \ namespace { \
static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \ static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
......
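Note: a hypothetical call site illustrating the key produced by the OpKeyBuilder added above (a sketch, not code from this commit):

// Sketch only: registration and lookup must agree on one key per (op, T) pair.
std::string key = mace::OpKeyBuilder("Conv2D")
                      .TypeConstraint<float>("T")
                      .Build();
// Given Build() above, key == "Conv2DT_DT_FLOAT"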
...@@ -106,10 +106,10 @@ class Registerer { ...@@ -106,10 +106,10 @@ class Registerer {
} }
#define MACE_REGISTER_CREATOR(RegistryName, key, ...) \ #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)
#define MACE_REGISTER_CLASS(RegistryName, key, ...) \ #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)
} // namespace mace } // namespace mace
......
...@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() { ...@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
return; return;
} }
cl_command_queue_properties properties = 0;
#ifdef __ENABLE_PROFILING
enable_profiling_ = true;
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
#endif
// a context is like a "runtime link" to the device and platform; // a context is like a "runtime link" to the device and platform;
// i.e. communication is possible // i.e. communication is possible
cl::Context context({gpu_device}); cl::Context context({gpu_device});
cl_command_queue_properties properties = 0;
if (enable_profiling_) {
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
}
cl::CommandQueue command_queue(context, gpu_device, properties); cl::CommandQueue command_queue(context, gpu_device, properties);
instance = new OpenCLRuntime(context, gpu_device, command_queue); instance = new OpenCLRuntime(context, gpu_device, command_queue);
...@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() { ...@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
} }
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() { cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first."); MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>(); return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
} }
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() { cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first."); MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>(); return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
} }
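Note: a minimal usage sketch (not from this commit) of the profiling path above, assuming the build defined __ENABLE_PROFILING; kernel, gws and lws are placeholders:

auto runtime = OpenCLRuntime::Get();
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
    kernel, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws),
    nullptr, runtime->GetDefaultEvent());
runtime->command_queue().finish();  // wait so the event timestamps are valid
cl_ulong elapsed_ns =
    runtime->GetEventProfilingEndInfo() - runtime->GetEventProfilingStartInfo();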
...@@ -139,6 +141,7 @@ const std::map<std::string, std::string> ...@@ -139,6 +141,7 @@ const std::map<std::string, std::string>
OpenCLRuntime::program_map_ = { OpenCLRuntime::program_map_ = {
{"addn", "addn.cl"}, {"addn", "addn.cl"},
{"batch_norm", "batch_norm.cl"}, {"batch_norm", "batch_norm.cl"},
{"conv_2d", "conv_2d.cl"},
{"conv_2d_1x1", "conv_2d_1x1.cl"}, {"conv_2d_1x1", "conv_2d_1x1.cl"},
{"conv_2d_3x3", "conv_2d_3x3.cl"}, {"conv_2d_3x3", "conv_2d_3x3.cl"},
{"depthwise_conv_3x3", "depthwise_conv_3x3.cl"}, {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
......
...@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) { ...@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
} }
} }
std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"},
{DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"},
{DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt];
}
size_t GetEnumTypeSize(const DataType dt) { size_t GetEnumTypeSize(const DataType dt) {
switch (dt) { switch (dt) {
......
...@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt); ...@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);
size_t GetEnumTypeSize(const DataType dt); size_t GetEnumTypeSize(const DataType dt);
std::string DataTypeToString(const DataType dt);
template <class T> template <class T>
struct IsValidDataType; struct IsValidDataType;
......
...@@ -24,7 +24,7 @@ cc_library( ...@@ -24,7 +24,7 @@ cc_library(
"*.h", "*.h",
"hexagon/*.h", "hexagon/*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [ deps = [
"//mace/proto:cc_proto", "//mace/proto:cc_proto",
"//mace/core:core", "//mace/core:core",
...@@ -36,7 +36,7 @@ cc_test( ...@@ -36,7 +36,7 @@ cc_test(
name = "dsp_test", name = "dsp_test",
testonly = 1, testonly = 1,
srcs = glob(["*_test.cc"]), srcs = glob(["*_test.cc"]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([ linkopts = if_android([
"-ldl", "-ldl",
"-lm", "-lm",
...@@ -52,7 +52,7 @@ cc_test( ...@@ -52,7 +52,7 @@ cc_test(
name = "dsp_op_test", name = "dsp_op_test",
testonly = 1, testonly = 1,
srcs = glob(["test/*_test.cc"]), srcs = glob(["test/*_test.cc"]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([ linkopts = if_android([
"-ldl", "-ldl",
"-lm", "-lm",
...@@ -64,3 +64,21 @@ cc_test( ...@@ -64,3 +64,21 @@ cc_test(
"//mace/kernels:kernels", "//mace/kernels:kernels",
], ],
) )
cc_binary(
name = "mace_dsp_run",
srcs = [
"tool/mace_dsp_run.cc",
],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([
"-ldl",
"-lm",
]),
linkstatic = 1,
deps = [
":dsp",
"//mace/kernels:kernels",
"//mace/utils:command_line_flags",
],
)
\ No newline at end of file
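Note: the new mace_dsp_run target is a standalone on-device tool; per the usage comment in its source later in this diff, it is invoked roughly as mace_dsp_run --model=<pb file> --input_shape=1,3,224,224 --input_file=<raw floats> --output_file=<out> --round=<N>.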
...@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) { ...@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
} }
// input info // input info
const InputInfo& input_info = net_def.input_info()[0]; num_inputs_ = 0;
input_shape_.insert(input_shape_.begin(), for (const InputInfo &input_info: net_def.input_info()) {
input_info.dims().begin(), input_info.dims().end()); vector<index_t> input_shape;
while (input_shape_.size() < 4) { input_shape.insert(input_shape.begin(),
input_shape_.insert(input_shape_.begin(), 1); input_info.dims().begin(), input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
input_shapes_.push_back(input_shape);
input_data_types_.push_back(input_info.data_type());
num_inputs_ += 1;
} }
input_data_type_ = input_info.data_type();
// output info // output info
const OutputInfo& output_info = net_def.output_info()[0]; num_outputs_ = 0;
output_shape_.insert(output_shape_.begin(), for (const OutputInfo &output_info: net_def.output_info()) {
output_info.dims().begin(), output_info.dims().end()); vector<index_t> output_shape;
while (output_shape_.size() < 4) { output_shape.insert(output_shape.begin(),
output_shape_.insert(output_shape_.begin(), 1); output_info.dims().begin(), output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
output_shapes_.push_back(output_shape);
output_data_types_.push_back(output_info.data_type());
num_outputs_ += 1;
} }
output_data_type_ = output_info.data_type();
bool res = hexagon_nn_prepare(nn_id_) == 0; bool res = hexagon_nn_prepare(nn_id_) == 0;
return res; return res;
...@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() { ...@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME); hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
} }
bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
Tensor *output_tensor) {
LOG(INFO) << "Execute graph: " << nn_id_;
// single input and single output
MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
output_tensor->SetDtype(output_data_types_[0]);
output_tensor->Resize(output_shapes_[0]);
vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_ASSERT(output_shape == output_shapes_[0],
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size();
int num_outputs = output_tensors->size();
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
for (int i = 0; i < num_inputs; ++i) {
vector<index_t> input_shape = input_tensors[i].shape();
inputs[i].batches = input_shape[0];
inputs[i].height = input_shape[1];
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
}
for (int i = 0; i < num_outputs; ++i) {
(*output_tensors)[i].SetDtype(output_data_types_[i]);
(*output_tensors)[i].Resize(output_shapes_[i]);
outputs[i].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i].raw_mutable_data());
outputs[i].dataLen = (*output_tensors)[i].raw_size();
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) {
vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
outputs[i].width, outputs[i].depth};
MACE_ASSERT(output_shape == output_shapes_[i],
"wrong output shape inferred");
MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
"wrong output bytes inferred.");
}
delete [] inputs;
delete [] outputs;
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor) {
vector<Tensor> input_tensors(3);
vector<Tensor> output_tensors(3);
input_tensors[0].SetDtype(DT_UINT8);
output_tensors[0].SetDtype(DT_UINT8);
input_tensors[0].ResizeLike(input_tensor);
input_tensors[1].Resize({1, 1, 1, 1});
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
}
output_tensor->ResizeLike(output_tensors[0]);
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
return true;
}
} // namespace mace } // namespace mace
\ No newline at end of file
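Note: in ExecuteGraphPreQuantize above, the DSP graph is given three tensors (the quantized uint8 data plus single-element float min and max) and returns three tensors in the same layout; the first output is dequantized with the returned min/max, so callers keep working purely in float.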
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon/hexagon_controller.h"
#include "mace/dsp/hexagon_nn_ops.h" #include "mace/dsp/hexagon_nn_ops.h"
#include "mace/dsp/util/quantize.h"
#include "mace/core/common.h" #include "mace/core/common.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
...@@ -23,35 +24,10 @@ class HexagonControlWrapper { ...@@ -23,35 +24,10 @@ class HexagonControlWrapper {
bool Finalize(); bool Finalize();
bool SetupGraph(const NetDef& net_def); bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const std::string &model_file); bool SetupGraph(const std::string &model_file);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
LOG(INFO) << "Execute graph: " << nn_id_; bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
output_tensor->SetDtype(output_data_type_); vector<Tensor> *output_tensors);
output_tensor->Resize(output_shape_); bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_ASSERT(output_shape == output_shape_,
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool TeardownGraph(); bool TeardownGraph();
void PrintLog(); void PrintLog();
...@@ -70,11 +46,14 @@ class HexagonControlWrapper { ...@@ -70,11 +46,14 @@ class HexagonControlWrapper {
int nn_id_; int nn_id_;
Serializer serializer_; Serializer serializer_;
Quantizer quantizer_;
vector<index_t> input_shape_;
vector<index_t> output_shape_; vector<vector<index_t>> input_shapes_;
DataType input_data_type_; vector<vector<index_t>> output_shapes_;
DataType output_data_type_; vector<DataType> input_data_types_;
vector<DataType> output_data_types_;
uint32_t num_inputs_;
uint32_t num_outputs_;
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
}; };
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
using namespace mace; using namespace mace;
TEST(HexagonControlerWrapper, GetVersion) { TEST(HexagonControlerWrapper, InputFloat) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
HexagonControlWrapper wrapper; HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion(); VLOG(0) << "version: " << wrapper.GetVersion();
...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
wrapper.ResetPerfInfo(); wrapper.ResetPerfInfo();
timeval tv1, tv2; timeval tv1, tv2;
gettimeofday(&tv1, NULL); gettimeofday(&tv1, NULL);
int round = 2; int round = 10;
for (int i = 0; i < round; ++i) { for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
} }
...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
} }
std::cout << std::endl; std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
TEST(HexagonControlerWrapper, PreQuantize) {
testing::internal::LogToStderr();
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
wrapper.Init();
wrapper.SetDebugLevel(0);
wrapper.Config();
VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
wrapper.PrintGraph();
Tensor input_tensor;
Tensor output_tensor;
input_tensor.Resize({1, 480, 480, 3});
float *input_data = input_tensor.mutable_data<float>();
for (int i = 0; i < input_tensor.size(); ++i) {
input_data[i] = i % 256;
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
wrapper.GetPerfInfo();
wrapper.PrintLog();
const float *output_data = output_tensor.data<float>();
for (int i = 0; i < output_tensor.size(); ++i) {
std::cout << output_data[i] << " ";
}
std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph(); VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize(); wrapper.Finalize();
} }
\ No newline at end of file
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "mace/dsp/hexagon_control_wrapper.h" #include "mace/dsp/hexagon_control_wrapper.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#define RESIZE_BILINEAR_TEST_CHANNELS 128
using namespace mace; using namespace mace;
static NetDef BuildNetDef() { static NetDef BuildNetDef() {
...@@ -17,7 +18,7 @@ static NetDef BuildNetDef() { ...@@ -17,7 +18,7 @@ static NetDef BuildNetDef() {
input_op->set_type("INPUT"); input_op->set_type("INPUT");
input_op->set_node_id(0); input_op->set_node_id(0);
input_op->set_padding(0); input_op->set_padding(0);
input_op->add_out_max_byte_size(1000); input_op->add_out_max_byte_size(1200);
// relu op // relu op
OperatorDef *resize_bilinear_op = net.add_op(); OperatorDef *resize_bilinear_op = net.add_op();
...@@ -45,7 +46,7 @@ static NetDef BuildNetDef() { ...@@ -45,7 +46,7 @@ static NetDef BuildNetDef() {
input_node_input = resize_bilinear_op->add_node_input(); input_node_input = resize_bilinear_op->add_node_input();
input_node_input->set_node_id(12); input_node_input->set_node_id(12);
input_node_input->set_output_port(0); input_node_input->set_output_port(0);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1200);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000);
...@@ -64,8 +65,8 @@ static NetDef BuildNetDef() { ...@@ -64,8 +65,8 @@ static NetDef BuildNetDef() {
new_dim_tensor->add_dims(2); new_dim_tensor->add_dims(2);
new_dim_tensor->set_data_type(DataType::DT_INT32); new_dim_tensor->set_data_type(DataType::DT_INT32);
new_dim_tensor->set_node_id(10); new_dim_tensor->set_node_id(10);
new_dim_tensor->add_int32_data(1); new_dim_tensor->add_int32_data(2);
new_dim_tensor->add_int32_data(1); new_dim_tensor->add_int32_data(2);
TensorProto *input_min_tensor = net.add_tensors(); TensorProto *input_min_tensor = net.add_tensors();
input_min_tensor->set_name("input_min"); input_min_tensor->set_name("input_min");
...@@ -86,20 +87,20 @@ static NetDef BuildNetDef() { ...@@ -86,20 +87,20 @@ static NetDef BuildNetDef() {
input_info->set_name("input_node"); input_info->set_name("input_node");
input_info->set_node_id(0); input_info->set_node_id(0);
input_info->add_dims(1); input_info->add_dims(1);
input_info->add_dims(2); input_info->add_dims(3);
input_info->add_dims(2); input_info->add_dims(3);
input_info->add_dims(128); input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
input_info->set_data_type(DataType::DT_UINT8); input_info->set_data_type(DataType::DT_UINT8);
input_info->set_max_byte_size(1000); input_info->set_max_byte_size(1200);
OutputInfo *output_info = net.add_output_info(); OutputInfo *output_info = net.add_output_info();
output_info->set_name("output_node"); output_info->set_name("output_node");
output_info->set_node_id(1); output_info->set_node_id(1);
output_info->add_dims(1); output_info->add_dims(1);
output_info->add_dims(1); output_info->add_dims(2);
output_info->add_dims(1); output_info->add_dims(2);
output_info->add_dims(128); output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
output_info->set_data_type(DataType::DT_UINT8); output_info->set_data_type(DataType::DT_UINT8);
output_info->set_max_byte_size(1000); output_info->set_max_byte_size(1200);
return net; return net;
} }
...@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) { ...@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) {
Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU); Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
Tensor input_tensor(cpu_allocator, DT_UINT8); Tensor input_tensor(cpu_allocator, DT_UINT8);
Tensor output_tensor(cpu_allocator, DT_UINT8); Tensor output_tensor(cpu_allocator, DT_UINT8);
input_tensor.Resize({1, 2, 2, 128}); input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS});
output_tensor.Resize({1, 1, 1, 128}); output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS});
uint8_t *input_data = input_tensor.mutable_data<uint8_t>(); uint8_t *input_data = input_tensor.mutable_data<uint8_t>();
const uint8_t *output_data = output_tensor.data<uint8_t>(); const uint8_t *output_data = output_tensor.data<uint8_t>();
for (int c = 0; c < 128; ++c) { for (int wh = 0; wh < 9; ++wh) {
input_data[c] = input_data[c + 128] = input_data[c + 256] for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) {
= input_data[c + 384] = (uint8_t)c; input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh;
}
} }
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
wrapper.PrintLog(); wrapper.PrintLog();
for (int i = 0; i < output_tensor.size(); ++i) { vector<uint8_t> expected {9, 8, 5, 3};
EXPECT_EQ(i, output_data[i]); for (int i = 0; i < 4; ++i) {
for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c)
EXPECT_EQ(expected[i],
output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]);
} }
std::cout << std::endl; std::cout << std::endl;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* mace_dsp_run --model=mobi_mace.pb \
* --input_shape=1,3,224,224 \
* --input_file=input_data \
* --output_file=mace.out
*/
#include <sys/time.h>
#include <fstream>
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/net.h"
#include "mace/utils/command_line_flags.h"
using namespace std;
using namespace mace;
void ParseShape(const string &str, vector<index_t> *shape) {
string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
int main(int argc, char **argv) {
string model_file;
string input_shape;
string input_file;
string output_file;
int round = 1;
std::vector<Flag> flag_list = {
Flag("model", &model_file, "model file name"),
Flag("input_shape", &input_shape, "input shape, separated by comma"),
Flag("input_file", &input_file, "input file name"),
Flag("output_file", &output_file, "output file name"),
Flag("round", &round, "round"),
};
string usage = Flags::Usage(argv[0], flag_list);
const bool parse_result = Flags::Parse(&argc, argv, flag_list);
if (!parse_result) {
LOG(ERROR) << usage;
return -1;
}
VLOG(0) << "model: " << model_file << std::endl
<< "input_shape: " << input_shape << std::endl
<< "input_file: " << input_file << std::endl
<< "output_file: " << output_file << std::endl
<< "round: " << round << std::endl;
vector<index_t> shape;
ParseShape(input_shape, &shape);
// load input
Tensor input_tensor;
input_tensor.Resize(shape);
float *input_data = input_tensor.mutable_data<float>();
ifstream in_file(input_file, ios::in | ios::binary);
in_file.read(reinterpret_cast<char *>(input_data),
input_tensor.size() * sizeof(float));
in_file.close();
// execute
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
wrapper.Init();
wrapper.SetDebugLevel(0);
wrapper.Config();
VLOG(0) << wrapper.SetupGraph(model_file);
wrapper.PrintGraph();
Tensor output_tensor;
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
cout << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round
<< endl;
wrapper.GetPerfInfo();
wrapper.PrintLog();
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
// save output
ofstream out_file(output_file, ios::binary);
out_file.write((const char *) (output_tensor.data<float>()),
output_tensor.size() * sizeof(float));
out_file.flush();
out_file.close();
}
\ No newline at end of file
...@@ -20,7 +20,7 @@ cc_library( ...@@ -20,7 +20,7 @@ cc_library(
hdrs = glob([ hdrs = glob([
"*.h", "*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [ deps = [
"//mace/core:core", "//mace/core:core",
], ],
......
...@@ -10,15 +10,23 @@ ...@@ -10,15 +10,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> struct AddNFunctorBase {};
struct AddNFunctor {
void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) { template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor); Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size(); index_t size = input_tensors[0]->size();
T *output_ptr = output_tensor->mutable_data<T>(); T *output_ptr = output_tensor->mutable_data<T>();
memset(output_ptr, 0, size * sizeof(T)); memset(output_ptr, 0, size * sizeof(T));
int n = input_tensors.size(); int n = input_tensors.size();
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
Tensor::MappingGuard input_map(input_tensors[i]); Tensor::MappingGuard input_map(input_tensors[i]);
const T *input_ptr = input_tensors[i]->data<T>(); const T *input_ptr = input_tensors[i]->data<T>();
for (index_t j = 0; j < size; ++j) { for (index_t j = 0; j < size; ++j) {
...@@ -28,15 +36,17 @@ struct AddNFunctor { ...@@ -28,15 +36,17 @@ struct AddNFunctor {
} }
}; };
template<> template <>
void AddNFunctor<DeviceType::NEON, float>::operator()( void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor); const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
template<> template <typename T>
void AddNFunctor<DeviceType::OPENCL, float>::operator()( struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
std::vector<const Tensor *> &inputs, Tensor *output); void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_ADDN_H_ #endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
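Note: a hypothetical call site for the functor layout above; the same code compiles against the generic template, the NEON specialization, or the new OPENCL partial specialization:

// Sketch only: a, b and output are mace::Tensor instances with identical shapes.
mace::kernels::AddNFunctor<mace::DeviceType::CPU, float> addn;
addn({&a, &b}, &output);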
...@@ -28,9 +28,10 @@ struct BatchNormFunctor { ...@@ -28,9 +28,10 @@ struct BatchNormFunctor {
// new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
// new_offset = \offset - mean * common_val; // new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset; // Y = new_scale * X + new_offset;
const index_t n = input->dim(0); const index_t batch = input->dim(0);
const index_t channel = input->dim(1); const index_t height = input->dim(1);
const index_t sample_size = input->dim(2) * input->dim(3); const index_t width = input->dim(2);
const index_t channels = input->dim(3);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard scale_mapper(scale); Tensor::MappingGuard scale_mapper(scale);
...@@ -48,19 +49,26 @@ struct BatchNormFunctor { ...@@ -48,19 +49,26 @@ struct BatchNormFunctor {
const T *epsilon_ptr = epsilon->data<T>(); const T *epsilon_ptr = epsilon->data<T>();
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
vector<T> new_scale(channels);
vector<T> new_offset(channels);
#pragma omp parallel for #pragma omp parallel for
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channels; ++c) {
T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr); new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale; new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
index_t pos = c * sample_size; }
index_t pos = 0;
for (index_t i = 0; i < n; ++i) { #pragma omp parallel for
const T *input_sample_ptr = input_ptr + pos; for (index_t n = 0; n < batch; ++n) {
T *output_sample_ptr = output_ptr + pos; for (index_t h = 0; h < height; ++h) {
for (index_t j = 0; j < sample_size; ++j) { for (index_t w = 0; w < width; ++w) {
output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; for (index_t c = 0; c < channels; ++c) {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
++pos;
}
} }
pos += channel * sample_size;
} }
} }
} }
...@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()( ...@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output); Tensor *output);
template <> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( struct BatchNormFunctor<DeviceType::OPENCL, T> {
const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
const Tensor *mean, const Tensor *mean,
const Tensor *var, const Tensor *var,
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output); Tensor *output);
};
} // namepsace kernels } // namepsace kernels
} // namespace mace } // namespace mace
......
...@@ -11,13 +11,23 @@ ...@@ -11,13 +11,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct Conv2dFunctor { struct Conv2dFunctor : Conv2dFunctorBase {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides, Conv2dFunctor(const int *strides,
const Padding &paddings, const Padding &paddings,
const int *dilations) const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {} : Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -76,9 +86,10 @@ struct Conv2dFunctor { ...@@ -76,9 +86,10 @@ struct Conv2dFunctor {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0; T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
*output_data = bias_channel; *output_data = bias_channel;
T sum = 0; T sum = 0.0f;
const T *filter_ptr = filter_data + c; const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
...@@ -113,9 +124,6 @@ struct Conv2dFunctor { ...@@ -113,9 +124,6 @@ struct Conv2dFunctor {
} }
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
}; };
template<> template<>
...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output); Tensor *output);
template<>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, template<typename T>
const Tensor *filter, struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const Tensor *bias, Conv2dFunctor(const int *strides,
Tensor *output); const Padding &paddings,
const int *dilations)
: Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_FUSED_CONV_2D_H_
#define MACE_KERNELS_FUSED_CONV_2D_H_
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/conv_2d.h"
namespace mace {
namespace kernels {
struct FusedConv2dFunctorBase {
FusedConv2dFunctorBase(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<DeviceType D, typename T>
struct FusedConv2dFunctor : FusedConv2dFunctorBase {
FusedConv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: FusedConv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
T *output_data = output->mutable_data<T>();
T zero_value;
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
zero_value = half_float::half_cast<half>(0.0f);
} else {
zero_value = 0;
}
auto output_size = output->size();
for (int n = 0; n < output_size; ++n) {
*output_data = *output_data < 0 ? zero_value : *output_data;
output_data++;
}
}
};
template<typename T>
struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
FusedConv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: FusedConv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_FUSED_CONV_2D_H_
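Note: FusedConv2dFunctor layers a ReLU onto the existing Conv2dFunctor: the generic path runs the ordinary convolution and then clamps negative outputs to zero in place (using a half-typed zero when T is half), while the OPENCL partial specialization only declares operator() and defers to a fused kernel implemented elsewhere.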
...@@ -10,7 +10,7 @@ namespace kernels { ...@@ -10,7 +10,7 @@ namespace kernels {
template <> template <>
void AddNFunctor<DeviceType::NEON, float>::operator()( void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) { const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
// TODO: neon mem copy // TODO: neon mem copy
index_t size = output_tensor->size(); index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>(); float *output_ptr = output_tensor->mutable_data<float>();
...@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()( ...@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
}; };
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
\ No newline at end of file
...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
filter_shape[0] = input_tensor->shape()[1];
filter_shape[1] = input_tensor->shape()[1];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, output_shape.data(),
paddings.data());
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>(); const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>(); float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
Tensor padded_input; Tensor padded_input;
ConstructInputWithPadding(input_tensor, paddings, &padded_input); ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
input = padded_input.data<float>(); input = padded_input.data<float>();
input_shape = padded_input.shape().data(); input_shape = padded_input.shape().data();
#endif #endif
...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 2x2, strides: 2x2 // kernel_size: 2x2, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_2x2s2x2 } else { // AVG_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 3x3, strides: 2x2 // kernel_size: 3x3, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_3x3s2x2 } else { // AVG_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else { // not implement yet } else { // not implement yet
......
...@@ -5,52 +5,83 @@ ...@@ -5,52 +5,83 @@
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { template <typename T>
index_t element_size = input0->NumElements(); static void AddN(const std::vector<const Tensor *> &input_tensors,
index_t blocks = (element_size + 3) / 4; Tensor *output) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const uint32_t gws = blocks; const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype())); auto dt = DataTypeToEnum<T>::value;
auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0; uint32_t idx = 0;
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer()))); for (auto input : input_tensors) {
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer()))); addn_kernel.setArg(idx++,
addn_kernel.setArg(idx++, static_cast<int32_t>(element_size)); *(static_cast<const cl::Image2D *>(input->buffer())));
addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); }
addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange, addn_kernel, cl::NullRange,
cl::NDRange(gws), cl::NDRange(width_pixels, batch_height_pixels),
cl::NDRange(lws), cl::NDRange(64, 16), // TODO fix this
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
} }
template<> template <typename T>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors, void AddNFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *output_tensor) { const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
if (input_tensors.empty() || input_tensors.front() == nullptr) {
return;
}
size_t size = input_tensors.size(); size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
switch (size) { for (int i = 1; i < size; ++i) {
case 2:Add2(input_tensors[0], input_tensors[1], output_tensor); MACE_CHECK_NOTNULL(input_tensors[i]);
break; MACE_CHECK(batch == input_tensors[i]->dim(0));
default:MACE_NOT_IMPLEMENTED; MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
}; };
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
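Note: for scale, with a 1x64x64x32 output, channel_blocks = RoundUpDiv4(32) = 8, so the enqueued range is (8*64, 1*64) = (512, 64); the hard-coded local size (64, 16) happens to divide that evenly here, but as the TODO says it still needs proper tuning.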
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
...@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output) { Tensor *output) {
index_t pixel_size = input->dim(2) * input->dim(3); const index_t batch = input->dim(0);
index_t blocks = (pixel_size + 3) / 4; const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const uint32_t gws[3] = {static_cast<uint32_t>(input->dim(0)), const index_t channel_blocks = RoundUpDiv4(channels);
static_cast<uint32_t>(input->dim(1)),
static_cast<uint32_t>(blocks)}; const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
const std::vector<uint32_t> lws = {1, 1, kwg_size}; const std::vector<uint32_t> lws = {1, kwg_size, 1};
uint32_t idx = 0; uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(scale->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(scale->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(offset->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(offset->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(mean->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer())));
bm_kernel.setArg(idx++, static_cast<uint32_t>(pixel_size)); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> { auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
return {{1, 1, 64}, return {{8, 128, 1}, //SNPE size
{1, 1, 64},
{1, 1, 128}, {1, 1, 128},
{1, kwg_size/16, 16}, {1, kwg_size/16, 16},
{1, kwg_size/32, 32}, {1, kwg_size/32, 32},
...@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
func); func);
} }
template
struct BatchNormFunctor<DeviceType::OPENCL, float>;
template
struct BatchNormFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} }
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
string kernel_name; string kernel_name;
switch (type) { switch (type) {
......
#include <common.h> #include <common.h>
// Supported data type: half/float __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__kernel void add2(__global const DATA_TYPE *input0, __read_only image2d_t input1,
__global const DATA_TYPE *input1, #if INPUT_NUM > 2
__private const int size, __read_only image2d_t input2,
__global DATA_TYPE *output) { #endif
int idx = get_global_id(0); #if INPUT_NUM > 3
__read_only image2d_t input3,
#endif
__write_only image2d_t output) {
const int w = get_global_id(0);
const int hb = get_global_id(1);
if (idx + 4 > size) { const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
for(; idx < size; ++idx) {
*(output+idx) = *(input0+idx) + *(input1+idx); DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
} DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
} else { DATA_TYPE4 out = in0 + in1;
VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1); #if INPUT_NUM > 2
vstore4(in_data0+in_data1, idx, output); DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
} out = out + in2;
#endif
#if INPUT_NUM > 3
DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
out = out + in3;
#endif
WRITE_IMAGET(output, (int2)(w, hb), out);
} }
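Note: the INPUT_NUM define emitted by the host code above compiles in only the image reads that are actually needed, which is why the host side currently rejects more than four inputs with MACE_NOT_IMPLEMENTED.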
#include <common.h>

// Supported data types: half/float
__kernel void batch_norm(__read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,
                         __read_only image2d_t mean,
                         __read_only image2d_t var,
                         __global const DATA_TYPE *epsilon,
                         __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
  const int width = get_global_size(1);

  DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon));
  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;

  const int pos = ch_blk * width + w;

  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 out = in * new_scale + new_offset;
  WRITE_IMAGET(output, (int2)(pos, hb), out);
}
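// The kernel folds the batch-norm statistics into a single multiply-add,
//   out = new_scale * in + new_offset,
// with new_scale = scale / sqrt(var + epsilon) and
//      new_offset = offset - mean * new_scale,
// which is algebraically the standard inference-time batch normalization.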
...@@ -14,4 +14,11 @@
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
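// For reference, with the build options generated by the host helpers used in
// this change (presumably -DDATA_TYPE=float -DCMD_DATA_TYPE=f for float, and
// -DDATA_TYPE=half -DCMD_DATA_TYPE=h for half), the macros above expand to:
//   DATA_TYPE4   -> float4           / half4
//   READ_IMAGET  -> read_imagef(...) / read_imageh(...)
//   WRITE_IMAGET -> write_imagef(...)/ write_imageh(...)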
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int out_height,
__private const int out_width,
__private const int filter_height,
__private const int filter_width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
  // Unrolling this loop hurt performance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
        // Will prefetch L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
}
}
}
#ifdef FUSED_RELU
// TODO relux
out0 = fmax(out0, 0);
out1 = fmax(out1, 0);
out2 = fmax(out2, 0);
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
}
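// Each conv_2d work-item accumulates four output pixels (out0..out3) of one
// output-channel block, so the launch grid is roughly
// (out_channels/4, ceil(out_width/4), out_height * batch), matching the
// channel_blocks/width_blocks values computed by the host code added later
// in this change.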
#include <common.h>
#define vec_conv_2d_1x1_s1 \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
#define vec_conv_2d_1x1_s2 \
VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
#define vec_conv_2d_1x1_compute_loop \
for (int oc = 0; oc < 4; ++oc) { \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr + oc * out_pixel); \
}
#define vec_conv_2d_1x1_compute \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr);
// Supported data type: half/float
__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
__global const DATA_TYPE *filter, /* o, i, kh, kw */
#ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif /* defined(BIAS) */
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef BIAS
DATA_TYPE bias_value = bias[out_chan];
#else
DATA_TYPE bias_value = 0;
#endif
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value;
}
}
int in_chan = 0;
if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute_loop;
}
for (; out_chan < out_chan_end; ++out_chan) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute;
}
}
}
for (; in_chan < in_chan_num; ++in_chan) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p*stride];
output_ptr[p] += in * weights;
}
}
}
}
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int height,
                          __private const int width) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
...@@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  int4 w;
#if STRIDE == 1
  w.x = out_w_blk;
  w.y = w.x + out_w_blks;
  w.z = w.y + out_w_blks;
  w.w = w.z + out_w_blks;
  int out_hb_idx = (out_hb % height);
#else
  w.x = out_w_blk * 2;
  w.y = (out_w_blk + out_w_blks) * 2;
  w.z = (out_w_blk + 2 * out_w_blks) * 2;
  w.w = (out_w_blk + 3 * out_w_blks) * 2;
  int out_hb_idx = (out_hb % height) * 2;
#endif

  w.x = select(w.x, INT_MIN, w.x >= in_width);
  w.y = select(w.y, INT_MIN, w.y >= in_width);
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);

  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
                      -1,
                      out_hb_idx >= in_height);

  // Unrolling this loop hurt performance
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

    const int filter_x0 = in_ch_blk << 2;
    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));

    // Will prefetch L2 improve performance? How to prefetch image data?
    // Interleaving load and mul does not improve performance as expected
    out0 += in0.x * weights0;
    out0 += in0.y * weights1;
    out0 += in0.z * weights2;
    out0 += in0.w * weights3;

    out1 += in1.x * weights0;
    out1 += in1.y * weights1;
    out1 += in1.z * weights2;
    out1 += in1.w * weights3;

    out2 += in2.x * weights0;
    out2 += in2.y * weights1;
    out2 += in2.z * weights2;
    out2 += in2.w * weights3;

    out3 += in3.x * weights0;
    out3 += in3.y * weights1;
    out3 += in3.z * weights2;
    out3 += in3.w * weights3;

    in_x_base += in_width;
  }

#ifdef FUSED_RELU
  // TODO relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
#endif

  const int out_x_base = out_ch_blk * width;
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
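// Border handling: out-of-range column/row coordinates are replaced with
// INT_MIN / -1 via select(); with CLK_ADDRESS_CLAMP the corresponding image
// reads then come back as zero for the RGBA images used here, so the padded
// border contributes nothing to the accumulation.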
...@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
...@@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 =
      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
  DATA_TYPE4 out4 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
  DATA_TYPE4 out4 = 0;
#endif

#if STRIDE == 1
  int in_width0 = out_w_blk - padding_left;
  int in_width1 = in_width0 + out_w_blks;
  int in_width2 = in_width1 + out_w_blks;
  int in_width3 = in_width2 + out_w_blks;
  int in_width4 = in_width3 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
#else
  int in_width0 = out_w_blk * 2 - padding_left;
  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
  const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif

  const int batch_idx = (out_hb / out_height) * in_height;
  const int input_image_width = in_ch_blks * in_width;

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
  int in_idx, hb_idx, width_idx, in_width_idx;
  // Unrolling this loop hurt performance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
        in_idx = in_ch_blk * in_width;
        int in_width_value;
#define READ_INPUT(i) \
        in_width_value = in_width##i + width_idx; \
        in_width_value = select(in_idx + in_width_value, \
                                -1, \
                                (in_width_value < 0 || in_width_value >= in_width)); \
        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
        READ_INPUT(2);
        READ_INPUT(3);
        READ_INPUT(4);
#undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));

        // Will prefetch L2 improve performance? How to prefetch image data?
        // Interleaving load and mul does not improve performance as expected
        out0 += in0.x * weights0;
        out0 += in0.y * weights1;
        out0 += in0.z * weights2;
        out0 += in0.w * weights3;

        out1 += in1.x * weights0;
        out1 += in1.y * weights1;
        out1 += in1.z * weights2;
        out1 += in1.w * weights3;

        out2 += in2.x * weights0;
        out2 += in2.y * weights1;
        out2 += in2.z * weights2;
        out2 += in2.w * weights3;

        out3 += in3.x * weights0;
        out3 += in3.y * weights1;
        out3 += in3.z * weights2;
        out3 += in3.w * weights3;

        out4 += in4.x * weights0;
        out4 += in4.y * weights1;
        out4 += in4.z * weights2;
        out4 += in4.w * weights3;
      }
    }
  }

#ifdef FUSED_RELU
  // TODO relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
  out4 = fmax(out4, 0);
#endif

  const int out_x_base = out_ch_blk * out_width;
  int w = out_w_blk;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out3);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out4);
}
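// Unlike the generic conv_2d kernel above, the 3x3 variant produces five
// output pixels per work-item (out0..out4), which is why the host side below
// uses RoundUpDiv<index_t, 5>(width) for its width_blocks dimension.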
#include <common.h>

#ifdef FP16
#define MIN_VALUE -USHRT_MAX
#else
#define MIN_VALUE -FLT_MAX
#endif

VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#else
      res = vec_pooling_3_s2(input_ptr, in_width);
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
inline int calculate_avg_block_size(const int pool_size,
                                    const int pos_h,
                                    const int pos_w,
                                    const int h_size,
                                    const int w_size) {
  const int h_start = max(0, pos_h);
  const int w_start = max(0, pos_w);
  const int h_end = min(pos_h + pool_size, h_size);
  const int w_end = min(pos_w + pool_size, w_size);
  return (h_end - h_start) * (w_end - w_start);
}

// Supported data type: half/float
__kernel void pooling(__read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
                      __private const int pad_top,
                      __private const int pad_left,
                      __private const int stride,
                      __private const int pooling_size,
                      __write_only image2d_t output) {
  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);
  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

  const int batch_idx = (out_hb_idx / out_height) * in_height;
  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
  const int in_width_start = out_width_idx * stride - pad_left;
  const int in_channel_offset = out_chan_idx * in_width;

#ifdef POOL_AVG
  DATA_TYPE4 res = 0;
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    for (int width = 0; width < pooling_size; ++width) {
      int in_width_idx = in_width_start + width;
      in_width_idx = select(in_channel_offset + in_width_idx,
                            -1,
                            (in_width_idx < 0 || in_width_idx >= in_width));

      DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
      res = res + in;
    }
  }
  const int block_size = calculate_avg_block_size(pooling_size,
                                                  in_height_start, in_width_start,
                                                  in_height, in_width);
  res /= block_size;
#else
  DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    if (in_height_idx != -1) {
      for (int width = 0; width < pooling_size; ++width) {
        int in_width_idx = in_width_start + width;
        in_width_idx = select(in_channel_offset + in_width_idx,
                              -1,
                              (in_width_idx < 0 || in_width_idx >= in_width));
        if (in_width_idx != -1) {
          DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
          res = fmax(res, in);
        }
      }
    }
  }
#endif

  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
}
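// POOL_AVG selects average pooling at compile time; without it the kernel
// does max pooling seeded with MIN_VALUE, which the FP16 build maps to
// -USHRT_MAX instead of -FLT_MAX. The corresponding -DPOOL_AVG / -DFP16
// build options are assumed to be emitted by the pooling host code, which is
// not part of this diff.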
#include <common.h>

__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
                                      __private const int out_height) {
  const int ch_blk = get_global_id(0);
  const int ch_blks = get_global_size(0);
  const int w = get_global_id(1);
  const int out_width = get_global_size(1);
  const int hb = get_global_id(2);
  const int b = hb / out_height;
  const int h = hb % out_height;

  const float h_in = h * height_scale;
  const float w_in = w * width_scale;
...@@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int in_w_offset = ch_blk * in_width;
  const int in_h_offset = b * in_height;

  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
                                    (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
                                     (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
                                       (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
                                        (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;

  DATA_TYPE4 out = top + (bottom - top) * h_lerp;

  const int out_w_offset = ch_blk * out_width;
  const int out_h_offset = b * out_height;
  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
}
...@@ -9,50 +9,56 @@ namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output);

template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      const Tensor *filter,
                                                      const Tensor *bias,
                                                      Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
                                       const int *padding, const DataType dt,
                                       Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
      {nullptr, nullptr},
      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
  if (!input->is_image() || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }

  std::vector<index_t> output_shape(4);
...@@ -61,17 +67,24 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
      input->shape().data(), filter->shape().data(), dilations_,
      strides_, paddings_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
    conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
  } else {
    Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
  }
}

template
struct Conv2dFunctor<DeviceType::OPENCL, float>;

template
struct Conv2dFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
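// A rough host-side usage sketch (the Conv2D op wrapper that constructs this
// functor is not part of this diff; the constructor arguments below simply
// mirror the strides_/paddings_/dilations_ members used above):
//
//   kernels::Conv2dFunctor<DeviceType::OPENCL, half> conv2d(strides, paddings, dilations);
//   conv2d(input, filter, bias, output);  // dispatches to K1x1/K3x3 or the generic Conv2dOpencl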
...@@ -5,83 +5,44 @@ ...@@ -5,83 +5,44 @@
#include "mace/kernels/conv_2d.h" #include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
void Conv1x1V2(const Tensor *input, void Conv1x1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const bool fused_relu,
Tensor *output) { const int stride,
const DataType dt,
Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t channels = output->dim(1); const index_t height = output->dim(1);
const index_t height = output->dim(2); const index_t width = output->dim(2);
const index_t width = output->dim(3); const index_t channels = output->dim(3);
const index_t input_channels = input->dim(1); const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
auto runtime = OpenCLRuntime::Get(); const index_t input_width = input->dim(2);
auto program = runtime->program(); const index_t input_channels = input->dim(3);
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
// TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
// TODO check wired clReleaseCommandQueue latency
// The KernelFunctor can cause segment faults in cb_retain_event
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(input->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DSTRIDE_1"); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input, ...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
} }
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error; cl_int error;
error = command_queue.enqueueNDRangeKernel( error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height), cl::NDRange(static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)), static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8), cl::NDRange(4, 15, 8), // TODO auto tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
extern void Conv2dOpenclK1x1S1(const Tensor *input, extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const bool fused_relu,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t batch = output->dim(0); Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
MACE_CHECK(input_batch == batch && input_height == height &&
input_width == width);
Conv1x1V2(input, filter, bias, 1, output);
}; };
extern void Conv2dOpenclK1x1S2(const Tensor *input, extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const bool fused_relu,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
MACE_CHECK(input->dim(0) == output->dim(0)); Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
Conv1x1V2(input, filter, bias, 2, output);
}; };
} // namespace kernels } // namespace kernels
......
...@@ -12,8 +12,9 @@ namespace mace { ...@@ -12,8 +12,9 @@ namespace mace {
namespace kernels { namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride, const Tensor *bias, const bool fused_relu,
const int *padding, Tensor *output) { const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0; uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2))); conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3))); conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height)); conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[0] / 2);
...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel, cl::NullRange, conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks), cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)), static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8), cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *filter,
Conv2d3x3S12(input, filter, bias, 1, padding, output); const Tensor *bias,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
}; };
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S2(const Tensor *input,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *filter,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
}; };
} // namespace kernels } // namespace kernels
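Note (editor): the 3x3 kernel above derives its global work sizes by rounding the output channels up to blocks of 4 and the output width up to blocks of 5. A minimal, self-contained sketch of that round-up division; RoundUpDiv4 / RoundUpDiv are assumed to behave like this, the real helpers are defined elsewhere in the tree.

// Illustrative sketch only, not the real mace/utils helpers.
#include <cstdint>
#include <iostream>

using index_t = int64_t;

template <typename T, unsigned Divisor = 4>
T RoundUpDivSketch(T value) {
  return (value + Divisor - 1) / Divisor;  // ceil(value / Divisor) for value >= 0
}

int main() {
  const index_t channels = 70, width = 33;
  // conv_2d_3x3 packs 4 output channels and 5 output-width pixels per work-item.
  std::cout << "channel_blocks = " << RoundUpDivSketch<index_t, 4>(channels)  // 18
            << ", width_blocks = " << RoundUpDivSketch<index_t, 5>(width)     // 7
            << std::endl;
  return 0;
}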
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
} // namespace kernels
} // namespace mace
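Note (editor): each entry in the built_options set above is an OpenCL preprocessor define; BuildKernel is expected to fold them into the options string used when the program is compiled. A standalone sketch of that flattening, assuming a simple space-separated join (the real wiring lives inside OpenCLRuntime::BuildKernel and is not shown here).

// Sketch only: flattens a built_options set into a "-D..." options string.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=float");
  built_options.emplace("-DCMD_DATA_TYPE=f");
  built_options.emplace("-DSTRIDE=2");
  built_options.emplace("-DFUSED_RELU");
  built_options.emplace("");  // what gets emplaced when bias == nullptr

  std::string options;
  for (const std::string &opt : built_options) {
    if (!opt.empty()) options += opt + " ";  // skip the empty placeholder
  }
  std::cout << options << std::endl;
  return 0;
}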
...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, ...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/fused_conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
FusedConv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
}
template
struct FusedConv2dFunctor<DeviceType::OPENCL, float>;
template
struct FusedConv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
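Note (editor): FusedConv2dFunctor above dispatches through a kernel-size by stride table and falls back to the CPU functor whenever the entry is null. A trimmed-down sketch of just that lookup; the conv functions below are stubs, not the real kernels.

// Sketch only: kernel_size x stride dispatch with a nullptr fallback.
#include <cstdio>

typedef void (*Conv2dFn)();

static void Conv1x1S1() { std::puts("1x1 stride 1"); }
static void Conv1x1S2() { std::puts("1x1 stride 2"); }
static void Conv3x3S1() { std::puts("3x3 stride 1"); }
static void Conv3x3S2() { std::puts("3x3 stride 2"); }

int main() {
  static const Conv2dFn selector[5][2] = {
      {Conv1x1S1, Conv1x1S2},
      {nullptr, nullptr},
      {Conv3x3S1, Conv3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  const int kernel = 3, stride = 2;
  Conv2dFn fn =
      (kernel <= 5 && stride <= 2) ? selector[kernel - 1][stride - 1] : nullptr;
  if (fn == nullptr) {
    std::puts("unsupported shape: fall back to the slow CPU path");
  } else {
    fn();
  }
  return 0;
}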
...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::string DataTypeToCLType(const DataType dt) { std::string DtToCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "float"; return "float";
case DT_HALF: case DT_HALF:
return "half"; return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default: default:
LOG(FATAL) << "Unsupported data type"; LOG(FATAL) << "Unsupported data type";
return ""; return "";
} }
} }
std::string DataTypeToOPENCLCMDDataType(const DataType dt) { std::string DtToCLCMDDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "f"; return "f";
...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { ...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
} }
} }
std::string DtToUpstreamCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpstreamCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
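Note (editor): the split between DtToCLDt / DtToCLCMDDt and their Upstream variants is that the latter always report float, even when the tensor's logical type is half, as the functions above show. A self-contained sketch of the mapping for the two types the file still handles; the enum below is a stand-in, not the real DataType.

// Sketch only: DT_* is an illustrative enum, the real mapping is in helper.cc.
#include <iostream>
#include <string>

enum DataTypeSketch { DT_FLOAT, DT_HALF };

std::string DtToCLDtSketch(DataTypeSketch dt) {
  return dt == DT_HALF ? "half" : "float";
}

std::string DtToUpstreamCLDtSketch(DataTypeSketch) {
  return "float";  // both DT_FLOAT and DT_HALF map to "float" here, per helper.cc above
}

int main() {
  std::cout << "-DDATA_TYPE=" << DtToCLDtSketch(DT_HALF) << "\n"
            << "-DDATA_TYPE=" << DtToUpstreamCLDtSketch(DT_HALF) << std::endl;
  return 0;
}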
...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::string DataTypeToOPENCLCMDDataType(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DataTypeToCLType(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt);
std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -10,131 +10,94 @@ ...@@ -10,131 +10,94 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Pooling3(const Tensor *input, static void Pooling(const Tensor *input,
const int *stride, const int *stride,
const PoolingType type, const int *paddings,
Tensor *output) { const int pooling_size,
if (type != MAX) { const PoolingType type,
MACE_NOT_IMPLEMENTED; const DataType dt,
} Tensor *output) {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t out_height = output->dim(1);
index_t out_height = output->dim(2); index_t out_width = output->dim(2);
index_t out_width = output->dim(3); index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(pixel_width * out_height), static_cast<uint32_t>(batch * out_height),
}; };
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128}; uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0; uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2))); pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height)); pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width)); pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]); pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error;
} }
static void PoolingN(const Tensor *input, template<typename T>
const int *stride, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *paddings, Tensor *output) {
                      const int pooling_size,                               MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel does not support dilation yet";
const PoolingType type, std::vector<index_t> output_shape(4);
Tensor *output) { std::vector<int> paddings(2);
if (type != AVG) { std::vector<index_t> filter_shape = {
MACE_NOT_IMPLEMENTED; kernels_[0], kernels_[1],
} input->dim(3), input->dim(3)
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
}; };
auto runtime = OpenCLRuntime::Get(); kernels::CalcNHWCPaddingAndOutputSize(
std::set<std::string> built_options; input->shape().data(), filter_shape.data(),
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); dilations_, strides_, this->padding_,
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128}; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); DataTypeToEnum<T>::value, output);
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
} }
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
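Note (editor): the pooling kernel above no longer hard-codes a local work-group size; each dimension is clamped against the kernel's maximum work-group size so their product never exceeds it. A small sketch of that derivation with made-up sizes.

// Sketch only: mirrors the lws clamping used in the Pooling() function above.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t kwg_size = 256;  // as returned by GetKernelMaxWorkGroupSize
  const uint32_t channel_blocks = 16, out_width = 28, out_height_x_batch = 28;

  uint32_t lws[3];
  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
  lws[2] = std::min<uint32_t>(out_height_x_batch, kwg_size / (lws[0] * lws[1]));

  std::cout << lws[0] << " x " << lws[1] << " x " << lws[2] << std::endl;  // 16 x 16 x 1
  return 0;
}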
...@@ -6,24 +6,33 @@ ...@@ -6,24 +6,33 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h" #include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output) { const Tensor *input, const Tensor *resize_dims, Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape {batch, channels, out_height, out_width}; std::vector<index_t> output_shape {batch, out_height, out_width, channels};
output->Resize(out_shape); if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
uint32_t idx = 0; uint32_t idx = 0;
rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
rb_kernel.setArg(idx++, height_scale); rb_kernel.setArg(idx++, height_scale);
rb_kernel.setArg(idx++, width_scale); rb_kernel.setArg(idx++, width_scale);
rb_kernel.setArg(idx++, static_cast<int>(in_height)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
rb_kernel.setArg(idx++, static_cast<int>(in_width)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel( cl_int error = command_queue.enqueueNDRangeKernel(
rb_kernel, cl::NullRange, rb_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch * channels), cl::NDRange(static_cast<int32_t>(channel_blocks),
static_cast<int>(out_height), static_cast<int>(out_width)), static_cast<int32_t>(out_width),
// TODO (heliangliang) tuning and fix when kwg_size < devisor static_cast<int32_t>(out_height * batch)),
cl::NDRange(1, 16, kwg_size / 16), // TODO tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
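Note (editor): both the OpenCL path above and the CPU path call CalculateResizeScale, where align_corners changes how input and output extents are related. A sketch assuming the usual TensorFlow-style convention; the real helper in resize_bilinear.h may differ.

// Sketch only: assumed behaviour of CalculateResizeScale.
#include <cstdint>
#include <iostream>

using index_t = int64_t;

float CalculateResizeScaleSketch(index_t in_size, index_t out_size,
                                 bool align_corners) {
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
}

int main() {
  std::cout << CalculateResizeScaleSketch(4, 8, false) << "\n"   // 0.5
            << CalculateResizeScaleSketch(4, 8, true) << std::endl;  // ~0.428571
  return 0;
}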
...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te ...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
Tensor *batch_tensor) { Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options); auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -18,36 +18,66 @@ enum PoolingType { ...@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> struct PoolingFunctorBase {
struct PoolingFunctor { PoolingFunctorBase(const PoolingType pooling_type,
PoolingFunctor(const PoolingType pooling_type, const int *kernels,
const int *kernels, const int *strides,
const int *strides, const Padding padding,
const Padding padding, const int *dilations)
const int *dilations)
: pooling_type_(pooling_type), : pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_(padding), padding_(padding),
dilations_(dilations) {} dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor); Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>(); const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0]; index_t batch = output_shape[0];
index_t channels = output_shape[1]; index_t height = output_shape[1];
index_t height = output_shape[2]; index_t width = output_shape[2];
index_t width = output_shape[3]; index_t channels = output_shape[3];
index_t out_image_size = height * width; index_t out_image_size = height * width;
index_t input_channels = input_shape[1]; index_t input_height = input_shape[1];
index_t input_height = input_shape[2]; index_t input_width = input_shape[2];
index_t input_width = input_shape[3]; index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width; index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0]; int kernel_h = kernels_[0];
...@@ -59,11 +89,6 @@ struct PoolingFunctor { ...@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0]; int dilation_h = dilations_[0];
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
...@@ -71,25 +96,24 @@ struct PoolingFunctor { ...@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) { T res = std::numeric_limits<T>::lowest();
T max = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
max = std::max(max, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
} }
output[out_offset] = max; *output = res;
out_offset += 1; output++;
} }
} }
} }
...@@ -97,11 +121,10 @@ struct PoolingFunctor { ...@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) { } else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) {
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -110,14 +133,14 @@ struct PoolingFunctor { ...@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
} }
} }
output[out_offset] = sum / block_size; *output = sum / block_size;
out_offset += 1; output++;
} }
} }
} }
...@@ -125,22 +148,26 @@ struct PoolingFunctor { ...@@ -125,22 +148,26 @@ struct PoolingFunctor {
} }
} }
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
}; };
template <> template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor); Tensor *output_tensor);
template <> template<typename T>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()( struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Tensor *input_tensor, PoolingFunctor(const PoolingType pooling_type,
Tensor *output_tensor); const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
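Note (editor): the CPU pooling loops above were rewritten from NCHW to NHWC, which is why the inner offsets now multiply by input_channels. Element (b, h, w, c) of an NHWC tensor lives at ((b * H + h) * W + w) * C + c; a tiny sanity-check sketch of the offset arithmetic the loops rely on.

// Sanity-check sketch of the NHWC offset decomposition used above.
#include <cassert>
#include <cstdint>
#include <iostream>

using index_t = int64_t;

index_t NhwcOffset(index_t b, index_t h, index_t w, index_t c,
                   index_t H, index_t W, index_t C) {
  return ((b * H + h) * W + w) * C + c;
}

int main() {
  const index_t H = 4, W = 5, C = 3;
  const index_t b = 1, c = 2, inh = 3, inw = 4;
  // Same decomposition as the pooling loops: per-(batch, channel) base offset
  // plus (inh * W + inw) * C for the visited pixel.
  const index_t in_offset = b * (H * W) * C + c;
  const index_t input_offset = in_offset + (inh * W + inw) * C;
  assert(input_offset == NhwcOffset(b, inh, inw, c, H, W, C));
  std::cout << "offset = " << input_offset << std::endl;  // 119
  return 0;
}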
......
...@@ -61,63 +61,90 @@ void ResizeImage(const T *images, ...@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
const index_t channels, const index_t channels,
const std::vector<CachedInterpolation> &xs_vec, const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys, const std::vector<CachedInterpolation> &ys,
float *output) { T *output) {
const index_t in_channel_size = in_height * in_width; const index_t in_batch_num_values = channels * in_height * in_width;
const index_t in_batch_num_values = channels * in_channel_size; const index_t out_batch_num_values = channels * out_height * out_width;
const index_t out_channel_size = out_height * out_width;
const index_t out_batch_num_values = channels * out_channel_size;
const CachedInterpolation *xs = xs_vec.data(); const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2) #pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
    for (index_t c = 0; c < channels; ++c) {                                    const T *batch_input_ptr = images + in_batch_num_values * b;
const T *input_ptr = T *batch_output_ptr = output + out_batch_num_values * b;
images + in_batch_num_values * b + in_channel_size * c;
float *output_ptr = for (index_t y = 0; y < out_height; ++y) {
output + out_batch_num_values * b + out_channel_size * c; const T *y_lower_input_ptr =
for (index_t y = 0; y < out_height; ++y) { batch_input_ptr + ys[y].lower * in_width * channels;
const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width; const T *y_upper_input_ptr =
const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width; batch_input_ptr + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp; T *y_output_ptr = batch_output_ptr + y * out_width * channels;
for (index_t x = 0; x < out_width; ++x) { const float ys_lerp = ys[y].lerp;
auto xs_lower = xs[x].lower;
auto xs_upper = xs[x].upper; for (index_t x = 0; x < out_width; ++x) {
auto xs_lerp = xs[x].lerp; const float xs_lerp = xs[x].lerp;
const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
const float top_left = ys_input_lower_ptr[xs_lower]; const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
const float top_right = ys_input_lower_ptr[xs_upper]; const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
const float bottom_left = ys_input_upper_ptr[xs_lower]; const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
const float bottom_right = ys_input_upper_ptr[xs_upper]; T *output_ptr = y_output_ptr + x * channels;
output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left, for (index_t c = 0; c < channels; ++c) {
bottom_right, xs_lerp, ys_lerp); const T top_left = top_left_ptr[c];
const T top_right = top_right_ptr[c];
const T bottom_left = bottom_left_ptr[c];
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
} }
output_ptr += out_width;
} }
} }
} }
} }
} }
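Note (editor): ResizeImage above hands the four neighbouring pixels to ComputeLerp, which is assumed to perform the standard two-axis linear interpolation (its definition is elsewhere in this header). A sketch of that computation.

// Sketch only: assumed behaviour of ComputeLerp.
#include <cstdio>

float ComputeLerpSketch(float top_left, float top_right,
                        float bottom_left, float bottom_right,
                        float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

int main() {
  // The exact center of a 2x2 patch averages the four corners.
  std::printf("%f\n", ComputeLerpSketch(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f));  // 1.5
  return 0;
}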
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners), size_(size) {}
protected:
void GetOutputSize(const Tensor *resize_dims,
index_t *out_height,
index_t *out_width) {
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
bool align_corners_;
std::vector<index_t> size_;
};
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ResizeBilinearFunctor { struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
public:
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: align_corners_(align_corners), size_(size) {} : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *resize_dims, const Tensor *resize_dims,
Tensor *output) { Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, out_height, out_width, channels};
output->Resize(out_shape); output->Resize(out_shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor { ...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
channels, xs, ys, output_data); channels, xs, ys, output_data);
} }
};
protected: template<typename T>
void GetOutputSize(const Tensor *resize_dims, struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
index_t *out_height, ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
index_t *out_width) { : ResizeBilinearFunctorBase(size, align_corners) {}
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
private: void operator()(const Tensor *input,
bool align_corners_; const Tensor *resize_dims,
std::vector<index_t> size_; Tensor *output);
}; };
template <>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -22,4 +22,10 @@ def if_android_arm64(a): ...@@ -22,4 +22,10 @@ def if_android_arm64(a):
return select({ return select({
"//mace:android_arm64": a, "//mace:android_arm64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
\ No newline at end of file
def if_profiling(a):
return select({
"//mace:is_profiling": a,
"//conditions:default": [],
})
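Note (editor): if_profiling mirrors the existing if_android helpers, it returns its argument only when the //mace:is_profiling config setting is active and an empty list otherwise, so profiling-only copts stay out of normal builds. A hypothetical C++ consumer of such a compile flag, purely for illustration; the macro name below is not the one MACE actually defines.

// Hypothetical consumer of a profiling-only compile flag.
#include <chrono>
#include <iostream>

void RunOnce() { /* ... enqueue a kernel and wait ... */ }

int main() {
#ifdef PROFILING_ENABLED  // illustrative macro name
  auto start = std::chrono::steady_clock::now();
  RunOnce();
  auto end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
            << " us" << std::endl;
#else
  RunOnce();
#endif
  return 0;
}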
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
namespace mace { namespace mace {
template<DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, Workspace *ws)
...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> { ...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override { bool Run() override {
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size(); int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr); vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -9,47 +9,69 @@ ...@@ -9,47 +9,69 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) { static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM"); // Add input data
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str()); net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
} }
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data if (D == DeviceType::OPENCL) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size}); BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} }
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
} }
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \ int iters) { \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::ItemsProcessed(tot); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
} \ AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) } \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ #define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON); BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float); BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(100, 1000, float); // BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(100, 10000, float); BM_ADDN(4, 1, 240, 240, 256, float);
} // namespace mace // BM_ADDN(4, 1, 240, 240, 256, half);
\ No newline at end of file
} // namespace mace
...@@ -9,7 +9,7 @@ namespace mace { ...@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {}; class AddnOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void SimpleAdd2() { void SimpleAdd2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -20,30 +20,26 @@ void SimpleAdd2() { ...@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd2) { TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd2) { /*
SimpleAdd2<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
SimpleAdd2<DeviceType::OPENCL>(); */
}
template<DeviceType D> template <DeviceType D>
void SimpleAdd3() { void SimpleAdd3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -55,62 +51,80 @@ void SimpleAdd3() { ...@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd3) { TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd3) { /*
SimpleAdd3<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
} */
template<DeviceType D> template <DeviceType D>
void RandomTest() { void RandomTest() {
// Construct graph testing::internal::LogToStderr();
OpsTestNet net; srand(time(NULL));
OpDefBuilder("AddN", "AddNTest")
.Input("Input1") for (int round = 0; round < 10; ++round) {
.Input("Input2") // generate random input
.Output("Output") index_t n = 1 + (rand() % 5);
.Finalize(net.NewOperatorDef()); index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
// Add input data index_t c = 1 + (rand() % 32);
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4}); int input_num = 2 + rand() % 3;
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4}); // Construct graph
OpsTestNet net;
// Check auto op_def = OpDefBuilder("AddN", "AddNTest");
net.RunOp(D); for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
Tensor result; }
result.Copy(*net.GetOutput("Output")); op_def.Output("Output").Finalize(net.NewOperatorDef());
// Run // Add input data
net.RunOp(); for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5); }
}
// run on cpu
TEST_F(AddnOpTest, CPURandom) { net.RunOp();
RandomTest<DeviceType::CPU>(); // Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
} }
TEST_F(AddnOpTest, NEONRandom) { TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
} // namespace mace } // namespace mace
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, float>);
} // namespace mace REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
\ No newline at end of file .TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
} // namespace mace
...@@ -13,28 +13,45 @@ static void BatchNorm( ...@@ -13,28 +13,45 @@ static void BatchNorm(
int iters, int batch, int channels, int height, int width) { int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming(); mace::testing::StopTiming();
if ( D == OPENCL )
OpenCLRuntime::EnableProfiling();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(D); net.RunOp(D);
......
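Note (editor): for reference while reading the test changes below, the expected tensor in Simple() is consistent with the usual batch-norm formula y = (x - mean) / sqrt(var + epsilon) * scale + offset. A quick standalone check of the numbers used there.

// Reproduces the expected values {-3.86, -1.51, 0.83, 3.17, 5.51, 7.86} from the test.
#include <cmath>
#include <cstdio>

int main() {
  const float scale = 4.0f, offset = 2.0f, mean = 10.0f, var = 11.67f, eps = 1e-3f;
  const float inputs[] = {5, 7, 9, 11, 13, 15};
  for (float x : inputs) {
    const float y = (x - mean) / std::sqrt(var + eps) * scale + offset;
    std::printf("%.2f ", y);
  }
  std::printf("\n");
  return 0;
}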
...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; ...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
// Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 6, 2}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
...@@ -32,12 +22,44 @@ void Simple() { ...@@ -32,12 +22,44 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); 3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { ...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
Simple<DeviceType::CPU>(); Simple<DeviceType::CPU>();
} }
/*
TEST_F(BatchNormOpTest, SimpleNEON) { TEST_F(BatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>(); Simple<DeviceType::NEON>();
} }
*/
TEST_F(BatchNormOpTest, SimpleOPENCL) { TEST_F(BatchNormOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>(); Simple<DeviceType::OPENCL>();
} }
/*
TEST_F(BatchNormOpTest, SimpleRandomNeon) { TEST_F(BatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL)); srand(time(NULL));
...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
*/
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL)); srand(time(NULL));
...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 64; index_t height = 64;
index_t width = 64; index_t width = 64;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// tuning // run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 103; index_t height = 103;
index_t width = 113; index_t width = 113;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync(); net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
} }
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND")
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
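For orientation, the typed registration key produced by these calls concatenates the op name with the constrained data type, so both registrations above land under distinct keys. A small illustration (not part of this commit; the exact key text depends on DataTypeToString):

// Illustration only: the non-template TypeConstraint overload takes the
// attribute name and a DataType value.
const std::string float_key = OpKeyBuilder("BufferToImage")
                                  .TypeConstraint("T", DT_FLOAT)
                                  .Build();
const std::string half_key = OpKeyBuilder("BufferToImage")
                                  .TypeConstraint("T", DT_HALF)
                                  .Build();
// float_key != half_key, which is what lets the same op name dispatch to
// BufferToImageOp<OPENCL, float> or BufferToImageOp<OPENCL, half> at runtime.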
...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { ...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
} }
...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) { ...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
} }
template<DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
}
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
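The float-to-half round trip above only exercises the ARGUMENT buffer type. A hypothetical extension for the IN_OUT layout could follow the same pattern (test name, shape, and the suitability of the 1e-3 tolerance for fp16 rounding are assumptions, not part of this commit):

// Hypothetical extension, mirroring ArgFloatToHalfSmall above.
TEST(BufferToImageTest, InputFloatToHalfSmall) {
  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::IN_OUT,
                                                             {1, 2, 3, 4});
}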
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,12 +6,31 @@ ...@@ -6,12 +6,31 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -33,9 +33,9 @@ static void Conv2d(int iters, ...@@ -33,9 +33,9 @@ static void Conv2d(int iters,
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -44,6 +44,7 @@ static void Conv2d(int iters, ...@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -54,6 +55,7 @@ static void Conv2d(int iters, ...@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
...@@ -91,39 +93,39 @@ static void Conv2d(int iters, ...@@ -91,39 +93,39 @@ static void Conv2d(int iters,
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments //BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); //BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); //BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); //BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace } // namespace mace
This diff is collapsed.
...@@ -6,15 +6,21 @@ ...@@ -6,15 +6,21 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(DepthwiseConv2d, REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>); DepthwiseConv2dOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(DepthwiseConv2d, REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>); DepthwiseConv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(DepthwiseConv2d, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
namespace mace {
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, half>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_FUSED_CONV_2D_H_
#define MACE_OPS_FUSED_CONV_2D_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/fused_conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
public:
FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
return true;
}
private:
kernels::FusedConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
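A minimal sketch of how this new operator would be built from a test, mirroring the Conv2D tests touched elsewhere in this commit (assumed usage only; the Padding value, shapes, and image names are illustrative):

// Assumed test-side usage of FusedConv2D, given an OpsTestNet `net` whose
// image tensors were prepared with BufferToImage as in the Conv2D tests.
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
    .Input("InputImage")
    .Input("FilterImage")
    .Input("BiasImage")                      // optional: InputSize() > 2 above
    .Output("OutputImage")
    .AddIntsArg("strides", {1, 1})
    .AddIntArg("padding", Padding::SAME)     // Padding enum value assumed
    .AddIntsArg("dilations", {1, 1})
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
    .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL);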
This diff is collapsed.
...@@ -6,11 +6,15 @@ ...@@ -6,11 +6,15 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(GlobalAvgPooling, REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::CPU, float>); GlobalAvgPoolingOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(GlobalAvgPooling, REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::NEON, float>); GlobalAvgPoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
......
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) { std::vector<T> &res) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1); std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1, index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
res.resize(size); res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
} }
template <typename T> template <typename T>
...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) { ...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template <typename T, bool is_fp = is_floating_point_type<T>::value> template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<T, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) { static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i)); ExpectEqual(a(i), b(i));
} }
} }
static void Near(const Tensor &x, const Tensor &y, const double abs_err) { static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i; << " index = " << i;
...@@ -334,17 +340,18 @@ template <typename T> ...@@ -334,17 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value, static_assert(is_floating_point_type<T>::value,
"T is not a floating point type"); "T is not a floating point type");
Expector<T>::Near(x, y, abs_err); Expector<T, T>::Near(x, y, abs_err);
} }
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
std::string ToString(const T &input) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
std::stringstream ss; static_assert(is_floating_point_type<EXP_TYPE>::value
ss << input; && is_floating_point_type<RES_TYPE>::value,
return ss.str(); "T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
} }
template <DeviceType D> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net, void BufferToImage(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync(); net.Sync();
} }
template <DeviceType D> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net, void ImageToBuffer(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, ...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
......
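Taken together, the typed BufferToImage/ImageToBuffer helpers and the two-type ExpectTensorNear overload let a float CPU reference be checked against a half OpenCL result. A hedged sketch (the tensor names and the 1e-1 tolerance are assumptions, and half must count as a floating-point type for the test utilities):

// Sketch only: compare a float reference tensor against a half OpenCL output.
ImageToBuffer<DeviceType::OPENCL, half>(net, "OutputImage", "OPENCLOutput",
                                        kernels::BufferType::IN_OUT);
ExpectTensorNear<float, half>(expected, *net.GetOutput("OPENCLOutput"), 1e-1);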
...@@ -6,11 +6,29 @@ ...@@ -6,11 +6,29 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output); functor_(input, output);
return true; return true;
}; };
......
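The output-shape computation removed from PoolingOp::Run presumably moves into the pooling functor. A hedged sketch of the relocated logic, expressed as a standalone helper (only the removed call itself is taken from this diff; the helper name, parameter types, and index choices are assumptions):

// Assumed relocation of the removed resize logic; not shown in this commit.
inline void ResizePoolingOutput(const Tensor *input, const int *kernels,
                                const int *strides, const int *dilations,
                                Padding padding, Tensor *output) {
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  std::vector<index_t> filter_shape = {input->shape()[1], input->shape()[0],
                                       kernels[0], kernels[1]};
  kernels::CalcPaddingAndOutputSize(input->shape().data(), filter_shape.data(),
                                    dilations, strides, padding,
                                    output_shape.data(), paddings.data());
  output->Resize(output_shape);
}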
This diff is collapsed.
...@@ -6,10 +6,16 @@ ...@@ -6,10 +6,16 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
......
This diff is collapsed. (8 more files)