diff --git a/mace/BUILD b/mace/BUILD
index 1b95aae048469510fbe8c5d272602519689408e7..dbe38d6dad5658edc052ec77ec39be41ece8a7fc 100644
--- a/mace/BUILD
+++ b/mace/BUILD
@@ -23,3 +23,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "is_profiling",
+    define_values = {
+        "profiling": "true",
+    },
+    visibility = ["//visibility:public"],
+)
diff --git a/mace/core/BUILD b/mace/core/BUILD
index 4b6bb68275188ef9c4b5f269ffe3982481c7162c..6f1af8a54e3dbab2f14d30c1b6116aabe1bf183e 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -7,7 +7,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
+load("//mace:mace.bzl", "if_android", "if_profiling")
 
 cc_library(
     name = "opencl_runtime",
@@ -19,7 +19,7 @@ cc_library(
         "runtime/opencl/cl2.hpp",
         "runtime/opencl/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
     deps = [
         ":logging",
         "@opencl_headers//:opencl20_headers",
diff --git a/mace/core/half.h b/mace/core/half.h
index dde806fb153f76982f26f1c9d6beb28eab516ab2..9df24bd43956aa56b5de833800d63cdda5281269 100644
--- a/mace/core/half.h
+++ b/mace/core/half.h
@@ -1098,7 +1098,7 @@ namespace half_float
 
 		/// Conversion constructor.
 		/// \param rhs float to convert
-		explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+		half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
 
 		/// Conversion to single-precision.
 		/// \return single precision value representing expression value
diff --git a/mace/core/opencl_allocator.cc b/mace/core/opencl_allocator.cc
index 3b393542281266a4564767e732ea703c4371e738..0c4cf8f0f87069d20650622c578308983d61560b 100644
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   cl_int error;
   cl::Image2D *cl_image =
       new cl::Image2D(OpenCLRuntime::Get()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format, image_shape[0], image_shape[1],
                       0, nullptr, &error);
+  MACE_CHECK(error == CL_SUCCESS);
 
   return cl_image;
 }
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 97be7cd11c92065fa8f8016d4ce7c18a6db5440c..e2e8936b62b46e164e1508ae08e2f998f8e12b32 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -6,6 +6,24 @@
 
 namespace mace {
 
+OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
+
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
+                                           const DataType allowed) {
+  type_constraint_[attr_name] = allowed;
+  return *this;
+}
+
+const std::string OpKeyBuilder::Build() {
+  static const std::vector<std::string> type_order = {"T"};
+  std::string key = op_name_;
+  for (auto type : type_order) {
+    key += type + "_" + DataTypeToString(type_constraint_[type]);
+  }
+  return key;
+}
+
 std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
   static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
   return &g_device_type_registry;
@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                         Workspace *ws,
                                         DeviceType type) {
   OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
-  return registry->Create(operator_def.type(), operator_def, ws);
+  const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(
+      operator_def, "T", static_cast<int>(DT_FLOAT));
+  return registry->Create(OpKeyBuilder(operator_def.type().data())
+                              .TypeConstraint("T", static_cast<DataType>(dtype))
+                              .Build(),
+                          operator_def,
+                          ws);
 }
 
 OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 8625d2802d57aea8e64ca7f004b8fbe17885168f..6ee4a9c4d2c637fd7b60c070355c02e155db7a01 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
   }
 };
 
+class OpKeyBuilder {
+ public:
+  explicit OpKeyBuilder(const char *op_name);
+
+  OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
+
+  template <typename T>
+  OpKeyBuilder &TypeConstraint(const char *attr_name);
+
+  const std::string Build();
+
+ private:
+  std::string op_name_;
+  std::map<std::string, DataType> type_constraint_;
+};
+
+template <typename T>
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
+  return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
+}
+
 #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
   namespace {                                              \
     static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
diff --git a/mace/core/registry.h b/mace/core/registry.h
index 9a61ba1247f9a6227c69ed8e665bc7603b2f6c57..c92ebb123f03c8410129aa7ade5057e4eabe5195 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -106,10 +106,10 @@ class Registerer {
   }
 
 #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)
 
 #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)
 
 }  // namespace mace
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 4f95a9e7abd446ec8839b1998e13e5c7594dfd97..488b291d6df1061f95cccd0f89a492046eb4aa08 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
       return;
     }
 
+    cl_command_queue_properties properties = 0;
+#ifdef __ENABLE_PROFILING
+    enable_profiling_ = true;
+    profiling_ev_.reset(new cl::Event());
+    properties = CL_QUEUE_PROFILING_ENABLE;
+#endif
+
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl_command_queue_properties properties = 0;
-    if (enable_profiling_) {
-      profiling_ev_.reset(new cl::Event());
-      properties = CL_QUEUE_PROFILING_ENABLE;
-    }
     cl::CommandQueue command_queue(context, gpu_device, properties);
 
     instance = new OpenCLRuntime(context, gpu_device, command_queue);
@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
 }
 
 cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "profiling_ev_ is null, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 }
 
 cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "profiling_ev_ is null, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }
 
@@ -139,6 +141,7 @@ const std::map<std::string, std::string> OpenCLRuntime::program_map_ = {
     {"addn", "addn.cl"},
     {"batch_norm", "batch_norm.cl"},
+    {"conv_2d", "conv_2d.cl"},
     {"conv_2d_1x1", "conv_2d_1x1.cl"},
     {"conv_2d_3x3", "conv_2d_3x3.cl"},
     {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
diff --git a/mace/core/types.cc b/mace/core/types.cc
index 08e5097464624fd345d1753bfc73544a4e886f5f..5ecb5410541e36b27f83fa4e46d56956aacc1f2f 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
 
+std::string DataTypeToString(const DataType dt) {
+  static std::map<DataType, std::string> dtype_string_map = {
+      {DT_FLOAT, "DT_FLOAT"},
+      {DT_HALF, "DT_HALF"},
+      {DT_DOUBLE, "DT_DOUBLE"},
+      {DT_UINT8, "DT_UINT8"},
+      {DT_INT8, "DT_INT8"},
+      {DT_INT32, "DT_INT32"},
+      {DT_UINT32, "DT_UINT32"},
+      {DT_UINT16, "DT_UINT16"},
+      {DT_INT64, "DT_INT64"},
+      {DT_BOOL, "DT_BOOL"},
+      {DT_STRING, "DT_STRING"}
+  };
+  MACE_CHECK(dt != DT_INVALID) << "Invalid data type is not supported";
+  return dtype_string_map[dt];
+}
 
 size_t GetEnumTypeSize(const DataType dt) {
   switch (dt) {
diff --git a/mace/core/types.h b/mace/core/types.h
index 1fb6c805d3251fe058cbecc7f93b9d771e8a05e9..616e40b2aeba81a1eca0ddbe28b7acf4c56b2b0a 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);
 
 size_t GetEnumTypeSize(const DataType dt);
 
+std::string DataTypeToString(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
 
diff --git a/mace/dsp/BUILD b/mace/dsp/BUILD
index ca0183822ff2c6313bf8b1f8faa112becb3ef4b3..cbe9f834a40dba000bf7da74320dae422bdf99e1 100644
--- a/mace/dsp/BUILD
+++ b/mace/dsp/BUILD
@@ -24,7 +24,7 @@ cc_library(
         "*.h",
         "hexagon/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/proto:cc_proto",
         "//mace/core:core",
@@ -36,7 +36,7 @@ cc_test(
     name = "dsp_test",
     testonly = 1,
     srcs = glob(["*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -52,7 +52,7 @@ cc_test(
     name = "dsp_op_test",
     testonly = 1,
     srcs = glob(["test/*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -64,3 +64,21 @@ cc_test(
         "//mace/kernels:kernels",
     ],
 )
+
+cc_binary(
+    name = "mace_dsp_run",
+    srcs = [
+        "tool/mace_dsp_run.cc",
+    ],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
+    linkopts = if_android([
+        "-ldl",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":dsp",
+        "//mace/kernels:kernels",
+        "//mace/utils:command_line_flags",
+    ],
+)
\ No newline at end of file
diff --git a/mace/dsp/hexagon_control_wrapper.cc b/mace/dsp/hexagon_control_wrapper.cc
index 7c65e7e5212f0797e847b22fd67640bb58854f41..3f25a5d78d208d3d10abbd18ae6234ae2033d2ed 100644
--- a/mace/dsp/hexagon_control_wrapper.cc
+++ b/mace/dsp/hexagon_control_wrapper.cc
@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
   }
 
   // input info
-  const InputInfo& input_info = net_def.input_info()[0];
-  input_shape_.insert(input_shape_.begin(),
-                      input_info.dims().begin(), input_info.dims().end());
-  while (input_shape_.size() < 4) {
-    input_shape_.insert(input_shape_.begin(), 1);
+  num_inputs_ = 0;
+  for (const InputInfo &input_info: net_def.input_info()) {
+    vector<index_t> input_shape;
+    input_shape.insert(input_shape.begin(),
+                       input_info.dims().begin(), input_info.dims().end());
+    while (input_shape.size() < 4) {
+      input_shape.insert(input_shape.begin(), 1);
+    }
+    input_shapes_.push_back(input_shape);
+    input_data_types_.push_back(input_info.data_type());
+    num_inputs_ += 1;
   }
-  input_data_type_ = input_info.data_type();
 
   // output info
-  const OutputInfo& output_info = net_def.output_info()[0];
-  output_shape_.insert(output_shape_.begin(),
-                       output_info.dims().begin(), output_info.dims().end());
-  while (output_shape_.size() < 4) {
-    output_shape_.insert(output_shape_.begin(), 1);
+  num_outputs_ = 0;
+  for (const OutputInfo &output_info: net_def.output_info()) {
+    vector<index_t> output_shape;
+    output_shape.insert(output_shape.begin(),
+                        output_info.dims().begin(), output_info.dims().end());
+    while (output_shape.size() < 4) {
+      output_shape.insert(output_shape.begin(), 1);
+    }
+    output_shapes_.push_back(output_shape);
+    output_data_types_.push_back(output_info.data_type());
+    num_outputs_ += 1;
   }
-  output_data_type_ = output_info.data_type();
 
   bool res = hexagon_nn_prepare(nn_id_) == 0;
   return res;
@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
   hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
 }
 
+bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                         Tensor *output_tensor) {
+  LOG(INFO) << "Execute graph: " << nn_id_;
+  // single input and single output
+  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
+  output_tensor->SetDtype(output_data_types_[0]);
+  output_tensor->Resize(output_shapes_[0]);
+  vector<uint32_t> output_shape(4);
+  uint32_t output_bytes;
+  int res = hexagon_nn_execute(nn_id_,
+                               input_tensor.shape()[0],
+                               input_tensor.shape()[1],
+                               input_tensor.shape()[2],
+                               input_tensor.shape()[3],
+                               reinterpret_cast<const unsigned char *>(
+                                   input_tensor.raw_data()),
+                               input_tensor.raw_size(),
+                               &output_shape[0],
+                               &output_shape[1],
+                               &output_shape[2],
+                               &output_shape[3],
+                               reinterpret_cast<unsigned char *>(
+                                   output_tensor->raw_mutable_data()),
+                               output_tensor->raw_size(),
+                               &output_bytes);
+
+  MACE_ASSERT(output_shape == output_shapes_[0],
+              "wrong output shape inferred");
+  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
+              "wrong output bytes inferred.");
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
+                                            vector<Tensor> *output_tensors) {
+  LOG(INFO) << "Execute graph new: " << nn_id_;
+  int num_inputs = input_tensors.size();
+  int num_outputs = output_tensors->size();
+  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
+
+  hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
+  hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
+
+  for (int i = 0; i < num_inputs; ++i) {
vector input_shape = input_tensors[i].shape(); + inputs[i].batches = input_shape[0]; + inputs[i].height = input_shape[1]; + inputs[i].width = input_shape[2]; + inputs[i].depth = input_shape[3]; + inputs[i].data = const_cast( + reinterpret_cast(input_tensors[i].raw_data())); + inputs[i].dataLen = input_tensors[i].raw_size(); + inputs[i].data_valid_len = input_tensors[i].raw_size(); + inputs[i].unused = 0; + } + + for (int i = 0; i < num_outputs; ++i) { + (*output_tensors)[i].SetDtype(output_data_types_[i]); + (*output_tensors)[i].Resize(output_shapes_[i]); + outputs[i].data = reinterpret_cast( + (*output_tensors)[i].raw_mutable_data()); + outputs[i].dataLen = (*output_tensors)[i].raw_size(); + } + + int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs, + outputs, num_outputs); + + for (int i = 0; i < num_outputs; ++i) { + vector output_shape {outputs[i].batches, outputs[i].height, + outputs[i].width, outputs[i].depth}; + MACE_ASSERT(output_shape == output_shapes_[i], + "wrong output shape inferred"); + MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(), + "wrong output bytes inferred."); + } + + delete [] inputs; + delete [] outputs; + return res == 0; +}; + +bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor, + Tensor *output_tensor) { + vector input_tensors(3); + vector output_tensors(3); + input_tensors[0].SetDtype(DT_UINT8); + output_tensors[0].SetDtype(DT_UINT8); + input_tensors[0].ResizeLike(input_tensor); + input_tensors[1].Resize({1, 1, 1, 1}); + float *min_in_data = input_tensors[1].mutable_data(); + input_tensors[2].Resize({1, 1, 1, 1}); + float *max_in_data = input_tensors[2].mutable_data(); + quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data); + if (!ExecuteGraphNew(input_tensors, &output_tensors)) { + return false; + } + + output_tensor->ResizeLike(output_tensors[0]); + + const float *min_out_data = output_tensors[1].data(); + const float *max_out_data = output_tensors[2].data(); + quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor); + return true; +} + } // namespace mace \ No newline at end of file diff --git a/mace/dsp/hexagon_control_wrapper.h b/mace/dsp/hexagon_control_wrapper.h index a67e9903b7a42f6866fe1e1a63177586bfdfd326..fa9f47b1bc7fbbb08e58b70ab15f5cb8e884f847 100644 --- a/mace/dsp/hexagon_control_wrapper.h +++ b/mace/dsp/hexagon_control_wrapper.h @@ -7,6 +7,7 @@ #include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon_nn_ops.h" +#include "mace/dsp/util/quantize.h" #include "mace/core/common.h" #include "mace/core/tensor.h" #include "mace/proto/mace.pb.h" @@ -23,35 +24,10 @@ class HexagonControlWrapper { bool Finalize(); bool SetupGraph(const NetDef& net_def); bool SetupGraph(const std::string &model_file); - bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { - LOG(INFO) << "Execute graph: " << nn_id_; - output_tensor->SetDtype(output_data_type_); - output_tensor->Resize(output_shape_); - vector output_shape(4); - uint32_t output_bytes; - int res = hexagon_nn_execute(nn_id_, - input_tensor.shape()[0], - input_tensor.shape()[1], - input_tensor.shape()[2], - input_tensor.shape()[3], - reinterpret_cast( - input_tensor.raw_data()), - input_tensor.raw_size(), - &output_shape[0], - &output_shape[1], - &output_shape[2], - &output_shape[3], - reinterpret_cast( - output_tensor->raw_mutable_data()), - output_tensor->raw_size(), - &output_bytes); - - MACE_ASSERT(output_shape == output_shape_, - "wrong output shape 
inferred"); - MACE_ASSERT(output_bytes == output_tensor->raw_size(), - "wrong output bytes inferred."); - return res == 0; - }; + bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); + bool ExecuteGraphNew(const vector& input_tensors, + vector *output_tensors); + bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor); bool TeardownGraph(); void PrintLog(); @@ -70,11 +46,14 @@ class HexagonControlWrapper { int nn_id_; Serializer serializer_; - - vector input_shape_; - vector output_shape_; - DataType input_data_type_; - DataType output_data_type_; + Quantizer quantizer_; + + vector> input_shapes_; + vector> output_shapes_; + vector input_data_types_; + vector output_data_types_; + uint32_t num_inputs_; + uint32_t num_outputs_; DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/mace/dsp/hexagon_control_wrapper_test.cc b/mace/dsp/hexagon_control_wrapper_test.cc index 48a743c69ecdb09bb09ca95412fe8852a86a55eb..b34e028c16b80fdfe9c280a3edf353fa9e040ec6 100644 --- a/mace/dsp/hexagon_control_wrapper_test.cc +++ b/mace/dsp/hexagon_control_wrapper_test.cc @@ -8,7 +8,7 @@ using namespace mace; -TEST(HexagonControlerWrapper, GetVersion) { +TEST(HexagonControlerWrapper, InputFloat) { testing::internal::LogToStderr(); HexagonControlWrapper wrapper; VLOG(0) << "version: " << wrapper.GetVersion(); @@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { wrapper.ResetPerfInfo(); timeval tv1, tv2; gettimeofday(&tv1, NULL); - int round = 2; + int round = 10; for (int i = 0; i < round; ++i) { VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); } @@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { } std::cout << std::endl; + VLOG(0) << wrapper.TeardownGraph(); + wrapper.Finalize(); +} + +TEST(HexagonControlerWrapper, PreQuantize) { + testing::internal::LogToStderr(); + HexagonControlWrapper wrapper; + VLOG(0) << "version: " << wrapper.GetVersion(); + wrapper.Init(); + wrapper.SetDebugLevel(0); + wrapper.Config(); + VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb"); + wrapper.PrintGraph(); + + Tensor input_tensor; + Tensor output_tensor; + input_tensor.Resize({1, 480, 480, 3}); + float *input_data = input_tensor.mutable_data(); + for (int i = 0; i < input_tensor.size(); ++i) { + input_data[i] = i % 256; + } + + wrapper.ResetPerfInfo(); + timeval tv1, tv2; + gettimeofday(&tv1, NULL); + int round = 10; + for (int i = 0; i < round; ++i) { + VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor); + } + gettimeofday(&tv2, NULL); + VLOG(0) << "avg duration: " + << ((tv2.tv_sec - tv1.tv_sec) * 1000 + + (tv2.tv_usec - tv1.tv_usec) / 1000) / + round; + + wrapper.GetPerfInfo(); + wrapper.PrintLog(); + + const float *output_data = output_tensor.data(); + for (int i = 0; i < output_tensor.size(); ++i) { + std::cout << output_data[i] << " "; + } + std::cout << std::endl; + VLOG(0) << wrapper.TeardownGraph(); wrapper.Finalize(); } \ No newline at end of file diff --git a/mace/dsp/test/quantized_resize_bilinear_test.cc b/mace/dsp/test/quantized_resize_bilinear_test.cc index a1f26abad75d210e546817487123cf76b646532c..12a2f8d34b94aeb21c8d3507be4ab4b545c26c2e 100644 --- a/mace/dsp/test/quantized_resize_bilinear_test.cc +++ b/mace/dsp/test/quantized_resize_bilinear_test.cc @@ -5,6 +5,7 @@ #include "mace/dsp/hexagon_control_wrapper.h" #include "gtest/gtest.h" +#define RESIZE_BILINEAR_TEST_CHANNELS 128 using namespace mace; static NetDef BuildNetDef() { @@ -17,7 +18,7 @@ static NetDef BuildNetDef() { 
input_op->set_type("INPUT"); input_op->set_node_id(0); input_op->set_padding(0); - input_op->add_out_max_byte_size(1000); + input_op->add_out_max_byte_size(1200); // relu op OperatorDef *resize_bilinear_op = net.add_op(); @@ -45,7 +46,7 @@ static NetDef BuildNetDef() { input_node_input = resize_bilinear_op->add_node_input(); input_node_input->set_node_id(12); input_node_input->set_output_port(0); - resize_bilinear_op->add_out_max_byte_size(1000); + resize_bilinear_op->add_out_max_byte_size(1200); resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000); @@ -64,8 +65,8 @@ static NetDef BuildNetDef() { new_dim_tensor->add_dims(2); new_dim_tensor->set_data_type(DataType::DT_INT32); new_dim_tensor->set_node_id(10); - new_dim_tensor->add_int32_data(1); - new_dim_tensor->add_int32_data(1); + new_dim_tensor->add_int32_data(2); + new_dim_tensor->add_int32_data(2); TensorProto *input_min_tensor = net.add_tensors(); input_min_tensor->set_name("input_min"); @@ -86,20 +87,20 @@ static NetDef BuildNetDef() { input_info->set_name("input_node"); input_info->set_node_id(0); input_info->add_dims(1); - input_info->add_dims(2); - input_info->add_dims(2); - input_info->add_dims(128); + input_info->add_dims(3); + input_info->add_dims(3); + input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS); input_info->set_data_type(DataType::DT_UINT8); - input_info->set_max_byte_size(1000); + input_info->set_max_byte_size(1200); OutputInfo *output_info = net.add_output_info(); output_info->set_name("output_node"); output_info->set_node_id(1); output_info->add_dims(1); - output_info->add_dims(1); - output_info->add_dims(1); - output_info->add_dims(128); + output_info->add_dims(2); + output_info->add_dims(2); + output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS); output_info->set_data_type(DataType::DT_UINT8); - output_info->set_max_byte_size(1000); + output_info->set_max_byte_size(1200); return net; } @@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) { Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU); Tensor input_tensor(cpu_allocator, DT_UINT8); Tensor output_tensor(cpu_allocator, DT_UINT8); - input_tensor.Resize({1, 2, 2, 128}); - output_tensor.Resize({1, 1, 1, 128}); + input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS}); + output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS}); uint8_t *input_data = input_tensor.mutable_data(); const uint8_t *output_data = output_tensor.data(); - for (int c = 0; c < 128; ++c) { - input_data[c] = input_data[c + 128] = input_data[c + 256] - = input_data[c + 384] = (uint8_t)c; + for (int wh = 0; wh < 9; ++wh) { + for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) { + input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh; + } } VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); wrapper.PrintLog(); - for (int i = 0; i < output_tensor.size(); ++i) { - EXPECT_EQ(i, output_data[i]); + vector expected {9, 8, 5, 3}; + for (int i = 0; i < 4; ++i) { + for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) + EXPECT_EQ(expected[i], + output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]); } std::cout << std::endl; diff --git a/mace/dsp/tool/mace_dsp_run.cc b/mace/dsp/tool/mace_dsp_run.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c8e7afae7acfccec8418b4b63da75d0a6d47af4 --- /dev/null +++ b/mace/dsp/tool/mace_dsp_run.cc @@ -0,0 +1,109 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+//
+
+/**
+ * Usage:
+ *   mace_dsp_run --model=mobi_mace.pb \
+ *                --input_shape=1,3,224,224 \
+ *                --input_file=input_data \
+ *                --output_file=mace.out
+ */
+#include <sys/time.h>
+#include <fstream>
+#include "mace/dsp/hexagon_control_wrapper.h"
+#include "mace/core/net.h"
+#include "mace/utils/command_line_flags.h"
+
+using namespace std;
+using namespace mace;
+
+void ParseShape(const string &str, vector<index_t> *shape) {
+  string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  string model_file;
+  string input_shape;
+  string input_file;
+  string output_file;
+  int round = 1;
+
+  std::vector<Flag> flag_list = {
+      Flag("model", &model_file, "model file name"),
+      Flag("input_shape", &input_shape, "input shape, separated by comma"),
+      Flag("input_file", &input_file, "input file name"),
+      Flag("output_file", &output_file, "output file name"),
+      Flag("round", &round, "round"),
+  };
+
+  string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
+
+  if (!parse_result) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+
+  VLOG(0) << "model: " << model_file << std::endl
+          << "input_shape: " << input_shape << std::endl
+          << "input_file: " << input_file << std::endl
+          << "output_file: " << output_file << std::endl
+          << "round: " << round << std::endl;
+
+  vector<index_t> shape;
+  ParseShape(input_shape, &shape);
+
+  // load input
+  Tensor input_tensor;
+  input_tensor.Resize(shape);
+  float *input_data = input_tensor.mutable_data<float>();
+  ifstream in_file(input_file, ios::in | ios::binary);
+  in_file.read(reinterpret_cast<char *>(input_data),
+               input_tensor.size() * sizeof(float));
+  in_file.close();
+
+  // execute
+  HexagonControlWrapper wrapper;
+  VLOG(0) << "version: " << wrapper.GetVersion();
+  wrapper.Init();
+  wrapper.SetDebugLevel(0);
+  wrapper.Config();
+  VLOG(0) << wrapper.SetupGraph(model_file);
+  wrapper.PrintGraph();
+
+  Tensor output_tensor;
+  timeval tv1, tv2;
+  gettimeofday(&tv1, NULL);
+  for (int i = 0; i < round; ++i) {
+    VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
+  }
+  gettimeofday(&tv2, NULL);
+  cout << "avg duration: "
+       << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
+           (tv2.tv_usec - tv1.tv_usec) / 1000) /
+              round
+       << endl;
+
+  wrapper.GetPerfInfo();
+  wrapper.PrintLog();
+  VLOG(0) << wrapper.TeardownGraph();
+  wrapper.Finalize();
+
+  // save output
+  ofstream out_file(output_file, ios::binary);
+  out_file.write((const char *) (output_tensor.data<float>()),
+                 output_tensor.size() * sizeof(float));
+  out_file.flush();
+  out_file.close();
+}
\ No newline at end of file
diff --git a/mace/dsp/util/BUILD b/mace/dsp/util/BUILD
index 4a75e104fccca2214cd0ffbf014a8c224614d9f4..e5730b285116454ca7c15d5dd08110d3da7c3f42 100644
--- a/mace/dsp/util/BUILD
+++ b/mace/dsp/util/BUILD
@@ -20,7 +20,7 @@ cc_library(
     hdrs = glob([
         "*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/core:core",
     ],
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index b47ef7e73f83a780fd4baf5aa729e980732da7ed..6195f324da7731cf2a7374ded017e734ce92faf8 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -10,15 +10,23 @@
 namespace mace {
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct AddNFunctor {
-  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+struct AddNFunctorBase {};
+
+template <DeviceType D, typename T>
+struct AddNFunctor : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor) {
+    output_tensor->ResizeLike(input_tensors[0]);
     Tensor::MappingGuard output_map(output_tensor);
     index_t size = input_tensors[0]->size();
     T *output_ptr = output_tensor->mutable_data<T>();
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
+      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
+      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
+      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
+      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
   }
 };
 
-template<>
+template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
 
-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(
-    std::vector<const Tensor *> &inputs, Tensor *output);
+template <typename T>
+struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor);
+};
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_ADDN_H_
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index b95d4895bc3963493ef55eb31e776aa4ca732dc0..36b2925742ce6214d3d4d41146221750f47a35b2 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -28,9 +28,10 @@ struct BatchNormFunctor {
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
-    const index_t n = input->dim(0);
-    const index_t channel = input->dim(1);
-    const index_t sample_size = input->dim(2) * input->dim(3);
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);
 
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard scale_mapper(scale);
@@ -48,19 +49,26 @@ struct BatchNormFunctor {
     const T *epsilon_ptr = epsilon->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
+    vector<T> new_scale(channels);
+    vector<T> new_offset(channels);
+
 #pragma omp parallel for
-    for (index_t c = 0; c < channel; ++c) {
-      T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
-      T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
-      index_t pos = c * sample_size;
+    for (index_t c = 0; c < channels; ++c) {
+      new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+      new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
+    }
 
-      for (index_t i = 0; i < n; ++i) {
-        const T *input_sample_ptr = input_ptr + pos;
-        T *output_sample_ptr = output_ptr + pos;
-        for (index_t j = 0; j < sample_size; ++j) {
-          output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
+#pragma omp parallel for
+    for (index_t n = 0; n < batch; ++n) {
+      // compute the offset per batch so iterations stay independent under OpenMP
+      index_t pos = n * height * width * channels;
+      for (index_t h = 0; h < height; ++h) {
+        for (index_t w = 0; w < width; ++w) {
+          for (index_t c = 0; c < channels; ++c) {
+            output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
+            ++pos;
+          }
         }
-        pos += channel * sample_size;
       }
     }
   }
@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const
Tensor *var, - const Tensor *epsilon, - Tensor *output); +template +struct BatchNormFunctor { + void operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output); +}; } // namepsace kernels } // namespace mace diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index a717c6a48513eb075ae4b36124213a109a7f4786..e9a41cfcafef011da308a4df81b3dbc79874bfb2 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -11,13 +11,23 @@ namespace mace { namespace kernels { +struct Conv2dFunctorBase { + Conv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + template -struct Conv2dFunctor { - Conv2dFunctor() {} +struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} + : Conv2dFunctorBase(strides, paddings, dilations) {} void operator()(const Tensor *input, const Tensor *filter, @@ -76,9 +86,10 @@ struct Conv2dFunctor { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { - T bias_channel = bias_data ? bias_data[c] : 0; + T bias_channel = 0.0f; + if (bias) bias_channel = bias_data[c]; *output_data = bias_channel; - T sum = 0; + T sum = 0.0f; const T *filter_ptr = filter_data + c; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { @@ -113,9 +124,6 @@ struct Conv2dFunctor { } - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; }; template<> @@ -123,11 +131,19 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); -template<> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output); + +template +struct Conv2dFunctor : Conv2dFunctorBase { + Conv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : Conv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/fused_conv_2d.h b/mace/kernels/fused_conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..4daf28e63599497ea5af99ae7ef1a452dd838465 --- /dev/null +++ b/mace/kernels/fused_conv_2d.h @@ -0,0 +1,71 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_KERNELS_FUSED_CONV_2D_H_ +#define MACE_KERNELS_FUSED_CONV_2D_H_ + +#include "mace/core/tensor.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/conv_2d.h" + +namespace mace { +namespace kernels { + +struct FusedConv2dFunctorBase { + FusedConv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + +template +struct FusedConv2dFunctor : FusedConv2dFunctorBase { + FusedConv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : FusedConv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + Conv2dFunctor(strides_, paddings_, dilations_)(input, filter, bias, output); + T *output_data = output->mutable_data(); + + T zero_value; + if (DataTypeToEnum::value == DataType::DT_HALF) { + zero_value = half_float::half_cast(0.0f); + } else { + zero_value = 0; + } + auto output_size = output->size(); + for (int n = 0; n < output_size; ++n) { + *output_data = *output_data < 0 ? zero_value : *output_data; + output_data++; + } + } + +}; + +template +struct FusedConv2dFunctor : FusedConv2dFunctorBase { + FusedConv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : FusedConv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_FUSED_CONV_2D_H_ diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index d7ff94864ea3ba7469cea561558e39b41624db1f..33a2bec5bdfecb985dec1f20d3a0b01f2a245fd2 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -10,7 +10,7 @@ namespace kernels { template <> void AddNFunctor::operator()( - std::vector &input_tensors, Tensor *output_tensor) { + const std::vector &input_tensors, Tensor *output_tensor) { // TODO: neon mem copy index_t size = output_tensor->size(); float *output_ptr = output_tensor->mutable_data(); @@ -51,4 +51,4 @@ void AddNFunctor::operator()( }; } // namespace kernels -} // namespace mace \ No newline at end of file +} // namespace mace diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc index 0f9162349c03398c5301b152a9c87048431caa5f..76868335d12500623cc08fff5d0cfae70761cff9 100644 --- a/mace/kernels/neon/pooling_neon.cc +++ b/mace/kernels/neon/pooling_neon.cc @@ -58,19 +58,27 @@ void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor) { + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape(4); + filter_shape[0] = input_tensor->shape()[1]; + filter_shape[1] = input_tensor->shape()[1]; + filter_shape[2] = kernels_[0]; + filter_shape[3] = kernels_[1]; + + kernels::CalcPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), this->dilations_, + strides_, this->padding_, output_shape.data(), + paddings.data()); + output_tensor->Resize(output_shape); + const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); - int paddings[2]; - std::vector filter_shape = 
{input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); #ifdef __COPY_MAKE_PADDING Tensor padded_input; - ConstructInputWithPadding(input_tensor, paddings, &padded_input); + ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input); input = padded_input.data(); input_shape = padded_input.shape().data(); #endif @@ -80,17 +88,17 @@ void PoolingFunctor::operator()( // kernel_size: 2x2, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && @@ -98,17 +106,17 @@ void PoolingFunctor::operator()( // kernel_size: 3x3, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else { // not implement yet diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 6c5106db25c0f12cb625b6e5e0c80c0497541804..31cd19104f43082e10fa4fdef77e6d02ceeb67cd 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -5,52 +5,83 @@ #include "mace/kernels/addn.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { - index_t element_size = input0->NumElements(); - index_t blocks = (element_size + 3) / 4; +template +static void AddN(const std::vector &input_tensors, + Tensor *output) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); - const uint32_t gws = blocks; + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; auto runtime = OpenCLRuntime::Get(); std::set built_options; - 
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype())); - auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size())); + auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); uint32_t idx = 0; - addn_kernel.setArg(idx++, *(static_cast(input0->buffer()))); - addn_kernel.setArg(idx++, *(static_cast(input1->buffer()))); - addn_kernel.setArg(idx++, static_cast(element_size)); - addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); + for (auto input : input_tensors) { + addn_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + } + addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( addn_kernel, cl::NullRange, - cl::NDRange(gws), - cl::NDRange(lws), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + cl::NDRange(width_pixels, batch_height_pixels), + cl::NDRange(64, 16), // TODO fix this + nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); + MACE_CHECK(error == CL_SUCCESS) << "error code: " << error; } -template<> -void AddNFunctor::operator()(std::vector &input_tensors, - Tensor *output_tensor) { - - if (input_tensors.empty() || input_tensors.front() == nullptr) { - return; - } +template +void AddNFunctor::operator()( + const std::vector &input_tensors, Tensor *output_tensor) { size_t size = input_tensors.size(); + MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); + + const index_t batch = input_tensors[0]->dim(0); + const index_t height = input_tensors[0]->dim(1); + const index_t width = input_tensors[0]->dim(2); + const index_t channels = input_tensors[0]->dim(3); - switch (size) { - case 2:Add2(input_tensors[0], input_tensors[1], output_tensor); - break; - default:MACE_NOT_IMPLEMENTED; + for (int i = 1; i < size; ++i) { + MACE_CHECK_NOTNULL(input_tensors[i]); + MACE_CHECK(batch == input_tensors[i]->dim(0)); + MACE_CHECK(height == input_tensors[i]->dim(1)); + MACE_CHECK(width == input_tensors[i]->dim(2)); + MACE_CHECK(channels == input_tensors[i]->dim(3)); } + + std::vector output_shape = input_tensors[0]->shape(); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output_tensor->ResizeImage(output_shape, output_image_shape); + + AddN(input_tensors, output_tensor); }; +template +struct AddNFunctor; + +template +struct AddNFunctor; + } // namespace kernels -} // namespace mace +} // namespace mace diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index c7cd37e3ec7e6c1e0dbe31cf335bb105869e35c2..c17286895a8868732ada5608d9454cae31cdd746 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -11,8 +11,8 @@ namespace mace { namespace kernels { -template <> -void BatchNormFunctor::operator()( +template +void BatchNormFunctor::operator()( const Tensor *input, const Tensor *scale, const Tensor *offset, @@ -21,35 +21,39 @@ void BatchNormFunctor::operator()( const Tensor *epsilon, Tensor *output) { - index_t pixel_size = input->dim(2) * input->dim(3); - index_t blocks = (pixel_size + 3) / 4; + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const 
index_t width = input->dim(2); + const index_t channels = input->dim(3); - const uint32_t gws[3] = {static_cast(input->dim(0)), - static_cast(input->dim(1)), - static_cast(blocks)}; + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - const std::vector lws = {1, 1, kwg_size}; + const std::vector lws = {1, kwg_size, 1}; uint32_t idx = 0; - bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast(epsilon->buffer()))); - bm_kernel.setArg(idx++, static_cast(pixel_size)); - bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); - bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); + bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); auto params_generator = [&kwg_size]()->std::vector> { - return {{1, 1, 64}, + return {{8, 128, 1}, //SNPE size + {1, 1, 64}, {1, 1, 128}, {1, kwg_size/16, 16}, {1, kwg_size/32, 32}, @@ -80,5 +84,9 @@ void BatchNormFunctor::operator()( func); } +template +struct BatchNormFunctor; +template +struct BatchNormFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 511e4598309561a5a453113784db9de4d933399b..f3af3d22622bd5e893347d958da76dbec71a450a 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -24,8 +24,13 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); + if (buffer->dtype() == image->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + } auto runtime = OpenCLRuntime::Get(); string kernel_name; switch (type) { diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 55c8d0bf5d5ec32053c06eb9724e21156c99e35c..a93099303f8d2e6c6896c61c4a1978be1c222bbf 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,20 +1,33 @@ #include -// Supported data type: half/float 
-__kernel void add2(__global const DATA_TYPE *input0, - __global const DATA_TYPE *input1, - __private const int size, - __global DATA_TYPE *output) { - int idx = get_global_id(0); +__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t input1, +#if INPUT_NUM > 2 + __read_only image2d_t input2, +#endif +#if INPUT_NUM > 3 + __read_only image2d_t input3, +#endif + __write_only image2d_t output) { + const int w = get_global_id(0); + const int hb = get_global_id(1); - if (idx + 4 > size) { - for(; idx < size; ++idx) { - *(output+idx) = *(input0+idx) + *(input1+idx); - } - } else { - VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0); - VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1); - vstore4(in_data0+in_data1, idx, output); - } + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb)); + DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb)); + DATA_TYPE4 out = in0 + in1; + +#if INPUT_NUM > 2 + DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb)); + out = out + in2; +#endif + +#if INPUT_NUM > 3 + DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb)); + out = out + in3; +#endif + + WRITE_IMAGET(output, (int2)(w, hb), out); } diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index e6a52d491972b6efe5ec3ecec3f26792d66b76a6..d0ad2e2aca77a2cc0fb7a51a8a4671060842b077 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,43 +1,28 @@ #include // Supported data types: half/float -void kernel batch_norm(global const DATA_TYPE *input, - global const DATA_TYPE *scale, - global const DATA_TYPE *offset, - global const DATA_TYPE *mean, - global const DATA_TYPE *var, - global const DATA_TYPE *epsilon, - private const int pixels, - global DATA_TYPE *output, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) { - const int batch = get_global_id(0); - const int channel = get_global_id(1); - const int channels = get_global_size(1); - const int pixel_offset = get_global_id(2); - const int local_channel = get_local_id(1); - const int local_pixel_idx = get_local_id(2); +__kernel void batch_norm(__read_only image2d_t input, + __read_only image2d_t scale, + __read_only image2d_t offset, + __read_only image2d_t mean, + __read_only image2d_t var, + __global const DATA_TYPE *epsilon, + __write_only image2d_t output) { + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + const int width = get_global_size(1); - if(local_pixel_idx == 0) { - new_scale[local_channel] = (float4)(scale[channel] * rsqrt(var[channel] + *epsilon)); - new_offset[local_channel] = (float4)(offset[channel] - mean[channel] * new_scale[local_channel].x); - } + DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); - barrier(CLK_LOCAL_MEM_FENCE); + DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon)); + DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; - const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4; - const DATA_TYPE *input_ptr = input + image_offset; - DATA_TYPE *output_ptr 
= output + image_offset; - const int end = (batch * channels + channel + 1) * pixels; - if ((image_offset+4) > end) { - for (int i = image_offset; i < end; ++i) { - *output_ptr = new_scale[local_channel].x * *input_ptr + new_offset[local_channel].x; - ++input_ptr; - ++output_ptr; - } - } else { - VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr); - values = values * new_scale[local_channel] + new_offset[local_channel]; - vstore4(values, 0, output_ptr); - } -} + const int pos = ch_blk * width + w; + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 out = in * new_scale + new_offset; + WRITE_IMAGET(output, (int2)(pos, hb), out); +} diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 7c156d8d2ebd44012a3ef0aab04ebcb3b549aee1..499c8164ddc3a0c5158c97e70c6f6ec55f0ccd87 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -14,4 +14,11 @@ #define CMD_TYPE_STR(cmd, type) cmd##type #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type) +#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4) +#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) +#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) + + +__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl new file mode 100644 index 0000000000000000000000000000000000000000..e5ddb3d78fd5d5176123e9b0ab5e4e460e035314 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -0,0 +1,148 @@ +#include + +__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_ch_blks, + __private const int out_height, + __private const int out_width, + __private const int filter_height, + __private const int filter_width, + __private const int padding_top, + __private const int padding_left) { + const int out_ch_blk = get_global_id(0); + const int out_w_blk = get_global_id(1); + const int out_w_blks = get_global_size(1); + const int out_hb = get_global_id(2); + const int rounded_in_ch = in_ch_blks * 4; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; +#endif + + const int batch_idx = (out_hb / out_height) * in_height; + + DATA_TYPE4 in0, in1, in2, in3; + DATA_TYPE4 weights0, weights1, weights2, weights3; + int in_idx, in_width_idx; + // Unrolling this 
loop hurts performance
+  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
+
+      int in_hb_value = height_idx + hb_idx;
+      in_hb_value = select(in_hb_value + batch_idx,
+                           -1,
+                           (in_hb_value < 0 || in_hb_value >= in_height));
+
+      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
+        in_idx = in_ch_blk * in_width;
+        int in_width_value;
+#define READ_INPUT(i)                                                  \
+        in_width_value = in_width##i + width_idx;                      \
+        in_width_value = select(in_idx + in_width_value,               \
+                                -1,                                    \
+                                (in_width_value < 0 || in_width_value >= in_width)); \
+        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+
+        READ_INPUT(0);
+        READ_INPUT(1);
+        READ_INPUT(2);
+        READ_INPUT(3);
+
+#undef READ_INPUT
+
+        int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
+        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+        // Will prefetching into L2 improve performance? How to prefetch image data?
+
+        // Interleaving load and mul does not improve performance as expected
+        out0 += in0.x * weights0;
+        out0 += in0.y * weights1;
+        out0 += in0.z * weights2;
+        out0 += in0.w * weights3;
+
+        out1 += in1.x * weights0;
+        out1 += in1.y * weights1;
+        out1 += in1.z * weights2;
+        out1 += in1.w * weights3;
+
+        out2 += in2.x * weights0;
+        out2 += in2.y * weights1;
+        out2 += in2.z * weights2;
+        out2 += in2.w * weights3;
+
+        out3 += in3.x * weights0;
+        out3 += in3.y * weights1;
+        out3 += in3.z * weights2;
+        out3 += in3.w * weights3;
+
+      }
+    }
+  }
+
+#ifdef FUSED_RELU
+  // TODO relux
+  out0 = fmax(out0, 0);
+  out1 = fmax(out1, 0);
+  out2 = fmax(out2, 0);
+  out3 = fmax(out3, 0);
+#endif
+
+  const int out_x_base = out_ch_blk * out_width;
+  int w = out_w_blk;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out0);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out1);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out2);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out3);
+
+}
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 56f2cedc5e1f2427fcea57b91b9150e049f618ba..bf3844679006dc5b594a919b2ffa86b324903fc7 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,151 +1,15 @@
 #include <common.h>
 
-#define vec_conv_2d_1x1_s1 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
-
-
-#define vec_conv_2d_1x1_s2 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr
+ 3 * in_pixel); \ - VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \ - VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02); - - -#define vec_conv_2d_1x1_compute_loop \ - for (int oc = 0; oc < 4; ++oc) { \ - VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \ - VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \ - out += in0 * weights.x; \ - out += in1 * weights.y; \ - out += in2 * weights.z; \ - out += in3 * weights.w; \ - vstore4(out, 0, output_ptr + oc * out_pixel); \ - } - -#define vec_conv_2d_1x1_compute \ - VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \ - VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \ - out += in0 * weights.x; \ - out += in1 * weights.y; \ - out += in2 * weights.z; \ - out += in3 * weights.w; \ - vstore4(out, 0, output_ptr); - -// Supported data type: half/float -__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */ - __global const DATA_TYPE *filter, /* o, i, kh, kw */ -#ifdef BIAS - __global const DATA_TYPE *bias, /* o */ -#endif /* defined(BIAS) */ - __global DATA_TYPE *output, /* n, c, h, w */ - __private const int in_chan_num, - __private const int out_chan_num, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - -#ifdef STRIDE_1 - const int stride = 1; -#else - const int stride = 2; -#endif - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_offset = batch * in_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - int out_chan_len = out_chan_end - out_chan_begin; - int pixel_len = out_pixel_end - out_pixel_begin; - - for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; -#ifdef BIAS - DATA_TYPE bias_value = bias[out_chan]; -#else - DATA_TYPE bias_value = 0; -#endif - for (int p = 0; p < pixel_len; ++p) { - output_ptr[p] = bias_value; - } - } - - int in_chan = 0; - if (pixel_len == 4) { - for (; in_chan + 3 < in_chan_num; in_chan += 4) { - const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; - int out_chan = out_chan_begin; - for (; out_chan + 3 < out_chan_end; out_chan += 4) { - const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; - DATA_TYPE *output_ptr = output_base + 
out_chan * out_pixel; -#ifdef STRIDE_1 - vec_conv_2d_1x1_s1; -#else - vec_conv_2d_1x1_s2; -#endif - vec_conv_2d_1x1_compute_loop; - } - for (; out_chan < out_chan_end; ++out_chan) { - const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; -#ifdef STRIDE_1 - vec_conv_2d_1x1_s1; -#else - vec_conv_2d_1x1_s2; -#endif - vec_conv_2d_1x1_compute; - } - } - } - - for (; in_chan < in_chan_num; ++in_chan) { - const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; - for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan]; - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; - - for (int p = 0; p < pixel_len; ++p) { - float in = input_ptr[p*stride]; - output_ptr[p] += in * weights; - } - } - } -} - __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ +#ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif __write_only image2d_t output, + __private const int in_height, + __private const int in_width, __private const int in_ch_blks, + __private const int height, __private const int width) { const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); @@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0)); - half4 out[4]; - out[0] = (half4)(bias_value.x); - out[1] = (half4)(bias_value.y); - out[2] = (half4)(bias_value.z); - out[3] = (half4)(bias_value.w); - - int w[4]; - w[0] = out_w_blk; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - - // Unrolling this loop hurt perfmance - int in_x_base = 0; - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - half4 in[4]; - in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb)); - if (w[1] < width) { - // conditional load hurt perf, this branching helps sometimes - in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb)); - in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb)); - in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb)); - } - - // The order matters, load input first then load filter, why? - const int filter_x0 = in_ch_blk << 2; - half4 weights[4]; - #pragma unroll - for (int c = 0; c < 4; ++c) { - weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk)); - } - // Will prefetch L2 improve performance? How to pretch image data? 
- - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int c = 0; c < 4; ++c) { - out[c] += in[c].x * weights[0]; - out[c] += in[c].y * weights[1]; - out[c] += in[c].z * weights[2]; - out[c] += in[c].w * weights[3]; - } - - in_x_base += width; - } - - const int out_x_base = out_ch_blk * width; - write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]); - - if (w[1] >= width) return; - write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]); - - if (w[2] >= width) return; - write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]); - - if (w[3] >= width) return; - write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]); -} - -__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */ - __read_only image2d_t filter, /* cout%8 * cin, cout/8 */ - __read_only image2d_t bias, /* cout%8 * cout/8 */ - __write_only image2d_t output, - __private const int in_ch_blks, - __private const int width) { - const int out_ch_blk = get_global_id(0); - const int out_w_blk = get_global_id(1); - const int out_w_blks = get_global_size(1); - const int out_hb = get_global_id(2); +#ifdef BIAS + DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int4 w; +#if STRIDE == 1 + w.x = out_w_blk; + w.y = w.x + out_w_blks; + w.z = w.y + out_w_blks; + w.w = w.z + out_w_blks; + int out_hb_idx = (out_hb % height); +#else + w.x = out_w_blk * 2; + w.y = (out_w_blk + out_w_blks) * 2; + w.z = (out_w_blk + 2 * out_w_blks) * 2; + w.w = (out_w_blk + 3 * out_w_blks) * 2; + int out_hb_idx = (out_hb % height) * 2; +#endif - float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0)); - half4 bias_value03 = as_half4(bias_value.xy); - half4 bias_value47 = as_half4(bias_value.zw); - half4 out[8]; - out[0] = (half4)(bias_value03.x); - out[1] = (half4)(bias_value03.y); - out[2] = (half4)(bias_value03.z); - out[3] = (half4)(bias_value03.w); - out[4] = (half4)(bias_value47.x); - out[5] = (half4)(bias_value47.y); - out[6] = (half4)(bias_value47.z); - out[7] = (half4)(bias_value47.w); + w.x = select(w.x, INT_MIN, w.x >= in_width); + w.y = select(w.y, INT_MIN, w.y >= in_width); + w.z = select(w.z, INT_MIN, w.z >= in_width); + w.w = select(w.w, INT_MIN, w.w >= in_width); - int w[4]; - w[0] = out_w_blk; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; + out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, + -1, + out_hb_idx >= in_height); // Unrolling this loop hurts performance int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - half4 in[8]; - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[0], out_hb)); - in[wi << 1] = as_half4(in_value.xy); - in[wi << 1 + 1] = as_half4(in_value.zw); - } - // The order matters, load input first then load filter, why? 
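+ // Note: the select() calls above fold out-of-range coordinates to INT_MIN / -1; with CLK_ADDRESS_CLAMP such image reads return the zero border color, so the loads below need no explicit bounds checks.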
+ DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); + const int filter_x0 = in_ch_blk << 2; - half4 weights[8]; - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk)); - weights[wi << 1] = as_half4(weights_value.xy); - weights[wi << 1 + 1] = as_half4(weights_value.zw); - } + DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); // Will prefetching into L2 improve performance? How to prefetch image data? - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - int idx = wi << 1; - out[idx] += in[idx].x * weights[0]; - out[idx] += in[idx].y * weights[1]; - out[idx] += in[idx].z * weights[2]; - out[idx] += in[idx].w * weights[3]; + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; - ++idx; - out[idx] += in[idx].x * weights[4]; - out[idx] += in[idx].y * weights[5]; - out[idx] += in[idx].z * weights[6]; - out[idx] += in[idx].w * weights[7]; - } + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; - in_x_base += width; + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + in_x_base += in_width; } +#ifdef FUSED_RELU + // TODO: relux (ReLU capped at a max value) + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); +#endif + const int out_x_base = out_ch_blk * width; - float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + int out_x_idx = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); - if (w[1] >= width) return; - out_value = (float4)(as_float2(out[2]), as_float2(out[3])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); - if (w[2] >= width) return; - out_value = (float4)(as_float2(out[4]), as_float2(out[5])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); - if (w[3] >= width) return; - out_value = (float4)(as_float2(out[6]), as_float2(out[7])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); -} diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 33d7305b6e8ebb77d97071616fa5dfa9eb7c3a5d..08bf04d3c883e12f5970cd82a9394620b2649e51 100644 --- 
a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __write_only image2d_t output, __private const int in_height, __private const int in_width, - __private const int in_channels, + __private const int in_ch_blks, __private const int out_height, __private const int out_width, __private const int padding_top, @@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_w_blk = get_global_id(1); const int out_w_blks = get_global_size(1); const int out_hb = get_global_id(2); - const int in_ch_blks = (in_channels + 3) / 4; const int rounded_in_ch = in_ch_blks * 4; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0}; #ifdef BIAS - out[0] = - CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + int in_width4 = in_width3 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; #endif - int w[4]; - w[0] = out_w_blk - padding_left; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - - const int batch_idx = out_hb / out_height; - const int height_idx = out_hb % out_height; - int in_hb[3]; - in_hb[0] = height_idx - padding_top; - in_hb[1] = in_hb[0] + 1; - in_hb[2] = in_hb[1] + 1; - // Judge the height border for padding input. - in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height; - in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height; - in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height; - - const int input_image_width = in_ch_blks * in_width; + const int batch_idx = (out_hb / out_height) * in_height; + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; + int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurts performance - int idx = 0; - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - VEC_DATA_TYPE(DATA_TYPE, 4) in[36]; - VEC_DATA_TYPE(DATA_TYPE, 4) weights[36]; - - int filter_idx = in_ch_blk << 2; - int in_idx = in_ch_blk * in_width; - - #pragma unroll - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 3; ++j) { - idx = i * 12 + j * 4; - int in_width_idx = w[0] + j; - // Judge the width border for padding input. 
- if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 0] = 0; - } else { - in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[1] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 1] = 0; - } else { - in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[2] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 2] = 0; - } else { - in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[3] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 3] = 0; - } else { - in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - - weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); - weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); - - filter_idx += rounded_in_ch; - } - } - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int c = 0; c < 4; ++c) { - for (int i = 0; i < 9; ++i) { - out[c] += in[c + i * 4].x * weights[0 + i * 4]; - out[c] += in[c + i * 4].y * weights[1 + i * 4]; - out[c] += in[c + i * 4].z * weights[2 + i * 4]; - out[c] += in[c + i * 4].w * weights[3 + i * 4]; + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + for (short width_idx = 0; width_idx < 3; ++width_idx) { + + in_idx = in_ch_blk * in_width; + int in_width_value; +#define READ_INPUT(i) \ + in_width_value = in_width##i + width_idx; \ + in_width_value = select(in_idx + in_width_value, \ + -1, \ + (in_width_value < 0 || in_width_value >= in_width)); \ + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + READ_INPUT(4); + +#undef READ_INPUT + + int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetching into L2 improve performance? How to prefetch image data? 
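+ // Note: READ_INPUT folds out-of-bounds (padded) coordinates to -1, so with CLK_ADDRESS_CLAMP those reads return zero and the padded positions contribute nothing to the accumulation below.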
+ + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + out4 += in4.x * weights0; + out4 += in4.y * weights1; + out4 += in4.z * weights2; + out4 += in4.w * weights3; } } } +#ifdef FUSED_RELU + // TODO: relux (ReLU capped at a max value) + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); + out4 = fmax(out4, 0); +#endif + const int out_x_base = out_ch_blk * out_width; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[0] + padding_left, out_hb), - out[0]); - - w[1] += padding_left; - if (w[1] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[1], out_hb), - out[1]); - - w[2] += padding_left; - if (w[2] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[2], out_hb), - out[2]); - - w[3] += padding_left; - if (w[3] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[3], out_hb), - out[3]); + int w = out_w_blk; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out1); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out3); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out4); + } diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index bc987dddb96ec5202fb3882fc279d4480c077f38..bd2763fc8c7eb68c06e09b4822ccc5025807a151 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -1,193 +1,87 @@ #include -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2); - VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = 
fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr); - DATA_TYPE row01 = *(input_ptr + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width); - DATA_TYPE row11 = *(input_ptr + in_width + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2); - DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2); - - VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2); - - DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2); - return res; -} - -// Supported data type: half/float -__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - const int pixels = out_pixel_end - out_pixel_begin; - - for (int i = out_chan_begin; i < out_chan_end; ++i) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel; - DATA_TYPE *output_ptr = output_base + i * out_pixel; 
- if (pixels == 4) { - VEC_DATA_TYPE(DATA_TYPE,4) res; -#ifdef STRIDE_1 - res = vec_pooling_3_s1(input_ptr, in_width); +#ifdef FP16 +#define MIN_VALUE -USHRT_MAX #else - res = vec_pooling_3_s2(input_ptr, in_width); +#define MIN_VALUE -FLT_MAX #endif - vstore4(res, 0, output_ptr); - } else { - for (int p = 0; p < pixels; ++p) { - output_ptr[p] = inner_pooling_3(input_ptr, in_width); - input_ptr += stride; - } - } - } -} -int calculate_avg_block_size(const int pos_h, - const int pos_w, - const int pool_size, - const int pad_h, - const int pad_w, - const int h_size, - const int w_size) { - const int h_start = max(0, pos_h - pad_h); - const int w_start = max(0, pos_w - pad_w); - const int h_end = min(pos_h + pool_size - pad_h, h_size); - const int w_end = min(pos_w + pool_size - pad_w, w_size); +inline int calculate_avg_block_size(const int pool_size, + const int pos_h, + const int pos_w, + const int h_size, + const int w_size) { + const int h_start = max(0, pos_h); + const int w_start = max(0, pos_w); + const int h_end = min(pos_h + pool_size, h_size); + const int w_end = min(pos_w + pool_size, w_size); return (h_end - h_start) * (w_end - w_start); } // Supported data type: half/float -__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __private const int pad_h, - __private const int pad_w, - __private const int pooling_size, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_idx = get_global_id(1); - int out_pixel_idx = get_global_id(2); - - const int out_pixel_height = out_pixel_idx / out_width; - const int out_pixel_width = out_pixel_idx % out_width; - - const int out_chan_begin = out_chan_idx * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int in_pixel_idx = out_pixel_height * stride * in_width - + out_pixel_width * stride; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; - DATA_TYPE *output_base = output + out_offset + out_pixel_idx; - - const int block_size = calculate_avg_block_size( - out_pixel_height * stride, - out_pixel_width * stride, - pooling_size, - pad_h/2, - pad_w/2, - in_height - pad_h, - in_width - pad_w); - for (int i = out_chan_begin; i < out_chan_end; ++i) { - VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f; - DATA_TYPE sum1 = 0.0f; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - for (int y = 0; y < pooling_size; ++y) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; - int x = 0; - for (; x < (pooling_size-8); x += 8) { - VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); - sum8 += data; - input_ptr += 8; - } - for (; x < pooling_size; ++x) { - sum1 += *input_ptr; - input_ptr++; +__kernel void pooling(__read_only image2d_t input, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int pad_top, + __private const int pad_left, + __private const int stride, + __private const int pooling_size, + __write_only image2d_t output) { + const int out_chan_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_width = get_global_size(1); + const int 
out_hb_idx = get_global_id(2); + + const int batch_idx = (out_hb_idx / out_height) * in_height; + const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; + const int in_width_start = out_width_idx * stride - pad_left; + const int in_channel_offset = out_chan_idx * in_width; + + +#ifdef POOL_AVG + DATA_TYPE4 res = 0; + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = res + in; + } + } + const int block_size = calculate_avg_block_size(pooling_size, + in_height_start, in_width_start, + in_height, in_width); + res /= block_size; +#else + DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE); + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + if (in_height_idx != -1) { + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + if (in_width_idx != -1) { + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = fmax(res, in); + } } } - VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567; - VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23; - - *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; } +#endif + + WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); } diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index f34e63cbf07b1a360957fcf5eaf74661ec22b8c1..efb769d27b7ab7836d0681c2b84775047942805a 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,18 +1,19 @@ #include -// Supported data type: half/float -__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */ - __global DATA_TYPE *output /* n * c, h, w */, +__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __write_only image2d_t output, __private const float height_scale, __private const float width_scale, __private const int in_height, - __private const int in_width) { - const int c = get_global_id(0); - const int h = get_global_id(1); - const int w = get_global_id(2); - const int channels = get_global_size(0); - const int height = get_global_size(1); - const int width = get_global_size(2); + __private const int in_width, + __private const int out_height) { + const int ch_blk = get_global_id(0); + const int ch_blks = get_global_size(0); + const int w = get_global_id(1); + const int out_width = get_global_size(1); + const int hb = get_global_id(2); + const int b = hb / out_height; + const int h = hb % out_height; const float h_in = h * height_scale; const float w_in = w * width_scale; @@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, const float h_lerp = h_in - h_lower; const float w_lerp = w_in - w_lower; - const DATA_TYPE *input_base 
= input + c * in_height * in_width; - DATA_TYPE *output_base = output + c * height * width; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const int in_w_offset = ch_blk * in_width; + const int in_h_offset = b * in_height; - DATA_TYPE top_left = input_base[h_lower * in_width + w_lower]; - DATA_TYPE top_right = input_base[h_lower * in_width + w_upper]; - DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower]; - DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper]; + DATA_TYPE4 top_left = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_lower, in_h_offset + h_lower)); + DATA_TYPE4 top_right = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_upper, in_h_offset + h_lower)); + DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_lower, in_h_offset + h_upper)); + DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_upper, in_h_offset + h_upper)); - const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp; - const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; - output_base[h * width + w] = top + (bottom - top) * h_lerp; + DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp; + DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; + + DATA_TYPE4 out = top + (bottom - top) * h_lerp; + + const int out_w_offset = ch_blk * out_width; + const int out_h_offset = b * out_height; + WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 528928e618abf37a0220ed1d9ebf6a5a7c602564..c40481543796215c80f4367e8e5f01a59b32c3be 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -9,50 +9,56 @@ namespace mace { namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); -template <> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output) { +extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output); + +template +void Conv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { 
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + if (!input->is_image() || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; - // TODO(heliangliang) The CPU/NEON kernel should map the buffer - Conv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output); - return; + MACE_NOT_IMPLEMENTED; } std::vector output_shape(4); @@ -61,17 +67,24 @@ void Conv2dFunctor::operator()(const Tensor *input, input->shape().data(), filter->shape().data(), dilations_, strides_, paddings_, output_shape.data(), paddings.data()); - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + + if (kernel_h == kernel_w && kernel_h <= 5 && + selector[kernel_h - 1][strides_[0] - 1] != nullptr) { + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum::value, output); } else { - output->Resize(output_shape); + Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum::value, output); } - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), output); } +template +struct Conv2dFunctor; +template +struct Conv2dFunctor; + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 28f57f484a8e1b29acfefa6f021281f2030cab31..d759689c6dc1ee8ffbfa98f2a4a58577a50c4271 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -5,83 +5,44 @@ #include "mace/kernels/conv_2d.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/utils/utils.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -void Conv1x1V2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - Tensor *output) { +void Conv1x1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int stride, + const DataType dt, + Tensor *output) { const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t input_channels = input->dim(1); - - auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - const index_t channel_blocks = (channels + 3) / 4; - const index_t pixel_blocks = (width + 3) / 4 * height; - - // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy - // TODO check wired 
clReleaseCommandQueue latency - // The KernelFunctor can cause segment faults in cb_retain_event - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options); - - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, - *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, - *(static_cast(bias->buffer()))); - } - conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); - conv_2d_kernel.setArg(idx++, static_cast(input_channels)); - conv_2d_kernel.setArg(idx++, static_cast(channels)); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(3))); - conv_2d_kernel.setArg(idx++, static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); - - auto command_queue = runtime->command_queue(); - cl_int error = command_queue.enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(batch), static_cast(channel_blocks), - static_cast(pixel_blocks)), - cl::NDRange(1, 2, kwg_size / 2), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); -} - -void Conv1x1V3(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - Tensor *output) { - const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t input_channels = input->dim(1); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_channels = input->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); + MACE_CHECK(input_batch == batch); + std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DSTRIDE_1"); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input, conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input_height)); + conv_2d_kernel.setArg(idx++, static_cast(input_width)); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); auto command_queue = runtime->command_queue(); cl_int error; error = command_queue.enqueueNDRangeKernel( conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), static_cast(height), + cl::NDRange(static_cast(channel_blocks), + static_cast(width_blocks), static_cast(height * batch)), - cl::NDRange(4, 15, 8), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + cl::NDRange(4, 15, 8), // TODO auto tuning + nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool fused_relu, const int *padding, + const DataType dt, Tensor *output) { - const index_t batch = output->dim(0); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - - const index_t input_batch = input->dim(0); - const index_t input_height = input->dim(2); - const index_t input_width = input->dim(3); - - MACE_CHECK(input_batch == batch && input_height == height && - input_width == width); - - Conv1x1V2(input, filter, bias, 1, output); + Conv1x1(input, filter, bias, fused_relu, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool fused_relu, const int *padding, + const DataType dt, Tensor *output) { - MACE_CHECK(input->dim(0) == output->dim(0)); - - Conv1x1V2(input, filter, bias, 2, output); + Conv1x1(input, filter, bias, fused_relu, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index b7e11e817922287a9b048ed9299c5d332f3ef0cf..7b7453ad53a4af2921cdea07f9a983b51865d848 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -12,8 +12,9 @@ namespace mace { namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, - const Tensor *bias, const uint32_t stride, - const int *padding, Tensor *output) { + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - const index_t width_blocks = RoundUpDiv4(width); + const index_t width_blocks = RoundUpDiv(width); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - 
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); @@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); conv_2d_kernel.setArg(idx++, padding[0] / 2); @@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, conv_2d_kernel, cl::NullRange, cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)), - cl::NDRange(4, 15, 8), + cl::NDRange(16, 16, 4), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } -void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, output); +void Conv2dOpenclK3x3S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output); }; -void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { +void Conv2dOpenclK3x3S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc new file mode 100644 index 0000000000000000000000000000000000000000..e46ecbcaca06e811de44b5a29e08abb1e3418906 --- /dev/null +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -0,0 +1,73 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "mace/core/common.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const index_t width_blocks = RoundUpDiv4(width); + + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + + auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + + uint32_t idx = 0; + conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + if (bias != nullptr) { + conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + } + conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); + conv_2d_kernel.setArg(idx++, static_cast(width)); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); + conv_2d_kernel.setArg(idx++, padding[0] / 2); + conv_2d_kernel.setArg(idx++, padding[1] / 2); + + auto command_queue = runtime->command_queue(); + cl_int error; + error = command_queue.enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), + static_cast(height * batch)), + cl::NDRange(16, 16, 4), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + MACE_CHECK(error == CL_SUCCESS, error); + +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index 60ce2a829a78a0a0439dd1e287c61f2dee4b490b..1402131df164cb0d1ba348617b3988e78f71c574 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e75cb9d7e369b57eb9caa0125a42f6d8b539c50 --- /dev/null +++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc @@ -0,0 +1,87 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/fused_conv_2d.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { + +extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +template +void FusedConv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + // Selection matrix: kernel_size x stride_size + static const Conv2dOpenclFunction selector[5][2] = { + {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, + {nullptr, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, + {nullptr, nullptr}, + {nullptr, nullptr}}; + + index_t kernel_h = filter->dim(0); + index_t kernel_w = filter->dim(1); + if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || + selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides_[0] << "x" << strides_[1] + << " is not implemented yet, using slow version"; + // TODO(heliangliang) The CPU/NEON kernel should map the buffer + FusedConv2dFunctor(strides_, paddings_, dilations_)( + input, filter, bias, output); + return; + } + + std::vector output_shape(4); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations_, + strides_, paddings_, output_shape.data(), paddings.data()); + + if (input->is_image()) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + } else { + output->Resize(output_shape); + } + + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum::value, output); +} + +template +struct FusedConv2dFunctor; +template +struct FusedConv2dFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 05221e55dedde3c7cc17d3f99d2818491d930b87..2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -54,35 +54,19 
@@ void CalImage2DShape(const std::vector &shape, /* NHWC */ } -std::string DataTypeToCLType(const DataType dt) { +std::string DtToCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: return "float"; case DT_HALF: return "half"; - case DT_UINT8: - return "uchar"; - case DT_INT8: - return "char"; - case DT_DOUBLE: - return "double"; - case DT_INT32: - return "int"; - case DT_UINT32: - return "int"; - case DT_UINT16: - return "ushort"; - case DT_INT16: - return "short"; - case DT_INT64: - return "long"; default: LOG(FATAL) << "Unsupported data type"; return ""; } } -std::string DataTypeToOPENCLCMDDataType(const DataType dt) { +std::string DtToCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: return "f"; @@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { } } +std::string DtToUpstreamCLDt(const DataType dt) { + switch (dt) { + case DT_FLOAT: + case DT_HALF: + return "float"; + default: + LOG(FATAL) << "Unsupported data type"; + return ""; + } +} + +std::string DtToUpstreamCLCMDDt(const DataType dt) { + switch (dt) { + case DT_FLOAT: + case DT_HALF: + return "f"; + default: + LOG(FATAL) << "Not supported data type for opencl cmd data type"; + return ""; + } +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 1ad94aa5d2545f059ec785c0b4ec36a87155fb49..70d74e5886c61a50c0a5fb684d02ecc6e00403cd 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape); -std::string DataTypeToOPENCLCMDDataType(const DataType dt); +std::string DtToCLCMDDt(const DataType dt); -std::string DataTypeToCLType(const DataType dt); +std::string DtToUpstreamCLCMDDt(const DataType dt); +std::string DtToCLDt(const DataType dt); + +std::string DtToUpstreamCLDt(const DataType dt); } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 0aaa89ae2c649583dddafaffbcce428d4ffc94fd..349c619574e425aea00b4521194c3ae04649942f 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -10,131 +10,94 @@ namespace mace { namespace kernels { -static void Pooling3(const Tensor *input, - const int *stride, - const PoolingType type, - Tensor *output) { - if (type != MAX) { - MACE_NOT_IMPLEMENTED; - } +static void Pooling(const Tensor *input, + const int *stride, + const int *paddings, + const int pooling_size, + const PoolingType type, + const DataType dt, + Tensor *output) { index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); - index_t channel_blk = (channels + 3) / 4; - const index_t pixel_width = (out_width + 3) / 4 ; + index_t channel_blocks = (channels + 3) / 4; const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(pixel_width * out_height), + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), }; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace(stride[0] == 1 ? 
"-DSTRIDE_1" : ""); - auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); + if (type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - const uint32_t lws[3] = {1, 8, 128}; + uint32_t lws[3]; + lws[0] = std::min(channel_blocks, kwg_size); + lws[1] = std::min(out_width, kwg_size / lws[0]); + lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, static_cast(input->dim(1))); pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); + pooling_kernel.setArg(idx++, paddings[0] / 2); + pooling_kernel.setArg(idx++, paddings[1] / 2); pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + pooling_kernel.setArg(idx++, pooling_size); + pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << error; } -static void PoolingN(const Tensor *input, - const int *stride, - const int *paddings, - const int pooling_size, - const PoolingType type, - Tensor *output) { - if (type != AVG) { - MACE_NOT_IMPLEMENTED; - } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); - - index_t channel_blk = (channels + 3) / 4; - const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(out_height * out_width), +template +void PoolingFunctor::operator()(const Tensor *input, + Tensor *output) { + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input->dim(3), input->dim(3) }; - auto runtime = OpenCLRuntime::Get(); - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); - const uint32_t lws[3] = {1, 8, 128}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); - uint32_t idx = 0; - pooling_kernel.setArg(idx++, 
*(static_cast(input->buffer()))); - pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); - pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); - pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, paddings[0]); - pooling_kernel.setArg(idx++, paddings[1]); - pooling_kernel.setArg(idx++, pooling_size); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, + DataTypeToEnum::value, output); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); -} - -template <> -void PoolingFunctor::operator()(const Tensor *input, - Tensor *output) { - int paddings[2]; - std::vector filter_shape = {input->dim(1), input->dim(0), - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); -#define POOLING_HELPER \ - switch(kernels_[0]) { \ - case 3: \ - Pooling3(input, strides_, pooling_type_, output); \ - break; \ - default: \ - PoolingN(input, strides_, paddings, kernels_[0], \ - pooling_type_, output); \ - break; \ - } - - if (paddings[0] > 0 || paddings[1] > 0) { - Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); - ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX); - input = &padded_input; - POOLING_HELPER - } else { - POOLING_HELPER - } -#undef POOLING_HELPER } +template +struct PoolingFunctor; +template +struct PoolingFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 7b77afea0fdd3aed146b22d736cacc5c6c165e79..27dd8e62b96422c368e324d249900b5e8d5f7767 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -6,24 +6,33 @@ #include "mace/core/tensor.h" #include "mace/kernels/resize_bilinear.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -template <> -void ResizeBilinearFunctor::operator()( +template +void ResizeBilinearFunctor::operator()( const Tensor *input, const Tensor *resize_dims, Tensor *output) { const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); index_t out_height; index_t out_width; GetOutputSize(resize_dims, &out_height, &out_width); MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape {batch, channels, out_height, out_width}; - output->Resize(out_shape); + std::vector output_shape {batch, out_height, out_width, channels}; + if (input->is_image()) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + } else { + output->Resize(output_shape); + } float height_scale = CalculateResizeScale(in_height, out_height, 
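The pooling kernel now derives its local work-group size from the device's limit instead of the hard-coded {1, 8, 128}. A standalone sketch of that clamping logic (illustrative helper, not the MACE API), which keeps lws[0] * lws[1] * lws[2] within kwg_size:

#include <algorithm>
#include <cstdint>
#include <cstdio>

void ClampLws(uint32_t channel_blocks, uint32_t width, uint32_t batch_height,
              uint32_t kwg_size, uint32_t lws[3]) {
  lws[0] = std::min(channel_blocks, kwg_size);
  lws[1] = std::min(width, kwg_size / lws[0]);
  lws[2] = std::min(batch_height, kwg_size / (lws[0] * lws[1]));
}

int main() {
  uint32_t lws[3];
  ClampLws(8, 60, 60, 256, lws);
  std::printf("%u %u %u\n", lws[0], lws[1], lws[2]);  // prints "8 32 1"
}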
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 7b77afea0fdd3aed146b22d736cacc5c6c165e79..27dd8e62b96422c368e324d249900b5e8d5f7767 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -6,24 +6,33 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input, const Tensor *resize_dims, Tensor *output) {
   const index_t batch = input->dim(0);
-  const index_t channels = input->dim(1);
-  const index_t in_height = input->dim(2);
-  const index_t in_width = input->dim(3);
+  const index_t in_height = input->dim(1);
+  const index_t in_width = input->dim(2);
+  const index_t channels = input->dim(3);
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
 
   index_t out_height;
   index_t out_width;
   GetOutputSize(resize_dims, &out_height, &out_width);
   MACE_CHECK(out_height > 0 && out_width > 0);
-  std::vector<index_t> out_shape {batch, channels, out_height, out_width};
-  output->Resize(out_shape);
+  std::vector<index_t> output_shape {batch, out_height, out_width, channels};
+  if (input->is_image()) {
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+    output->ResizeImage(output_shape, output_image_shape);
+  } else {
+    output->Resize(output_shape);
+  }
 
   float height_scale = CalculateResizeScale(in_height, out_height, align_corners_);
@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+
   uint32_t idx = 0;
-  rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
   rb_kernel.setArg(idx++, height_scale);
   rb_kernel.setArg(idx++, width_scale);
-  rb_kernel.setArg(idx++, static_cast<int>(in_height));
-  rb_kernel.setArg(idx++, static_cast<int>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
 
   auto command_queue = runtime->command_queue();
 
   cl_int error = command_queue.enqueueNDRangeKernel(
       rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(batch * channels),
-                  static_cast<uint32_t>(out_height), static_cast<uint32_t>(out_width)),
-      // TODO (heliangliang) tuning and fix when kwg_size < devisor
-      cl::NDRange(1, 16, kwg_size / 16),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+      cl::NDRange(static_cast<uint32_t>(channel_blocks),
+                  static_cast<uint32_t>(out_width),
+                  static_cast<uint32_t>(out_height * batch)),
+      // TODO tuning
+      cl::NDRange(1, static_cast<uint32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
 
+template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
+template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc
index 2716501c880fcd4fb2232e292b9396e27cfff2f3..72590be5e87ca1c5b721972855b8869e397df82c 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
                                                                 Tensor *batch_tensor) {
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
   auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
 
   uint32_t idx = 0;
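Both image-backed kernels launch one work item per channel block because the 2-D image layout packs four channels into one pixel, hence RoundUpDiv4 above. A quick standalone sketch, assuming RoundUpDiv4 has the obvious signature from mace/utils/utils.h:

#include <cstdint>
#include <cstdio>

typedef int64_t index_t;

// Assumed to match mace/utils/utils.h: number of 4-wide channel blocks.
inline index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }

int main() {
  // e.g. channels = 6 -> 2 blocks; image width = channel_blocks * out_width.
  std::printf("channels=6 -> blocks=%lld\n",
              static_cast<long long>(RoundUpDiv4(6)));
}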
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 11c05e47c5eb0faef5b3febe3ee63c4d1864d5c6..0a1960a4e7e891d6c71d1841cf672b3a48a83fdb 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -18,36 +18,66 @@ enum PoolingType {
 
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct PoolingFunctor {
-  PoolingFunctor(const PoolingType pooling_type,
-                 const int *kernels,
-                 const int *strides,
-                 const Padding padding,
-                 const int *dilations)
+struct PoolingFunctorBase {
+  PoolingFunctorBase(const PoolingType pooling_type,
+                     const int *kernels,
+                     const int *strides,
+                     const Padding padding,
+                     const int *dilations)
       : pooling_type_(pooling_type),
         kernels_(kernels),
         strides_(strides),
         padding_(padding),
         dilations_(dilations) {}
 
+  const PoolingType pooling_type_;
+  const int *kernels_;
+  const int *strides_;
+  const Padding padding_;
+  const int *dilations_;
+};
+
+template <DeviceType D, typename T>
+struct PoolingFunctor : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor) {
+
+    std::vector<index_t> output_shape(4);
+    std::vector<int> paddings(2);
+    std::vector<index_t> filter_shape = {
+        kernels_[0], kernels_[1],
+        input_tensor->dim(3), input_tensor->dim(3)
+    };
+
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input_tensor->shape().data(), filter_shape.data(),
+        dilations_, strides_, this->padding_,
+        output_shape.data(), paddings.data());
+    output_tensor->Resize(output_shape);
+
     Tensor::MappingGuard in_guard(input_tensor);
     Tensor::MappingGuard out_guard(output_tensor);
     const T *input = input_tensor->data<T>();
     T *output = output_tensor->mutable_data<T>();
     const index_t *input_shape = input_tensor->shape().data();
-    const index_t *output_shape = output_tensor->shape().data();
 
     index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t height = output_shape[1];
+    index_t width = output_shape[2];
+    index_t channels = output_shape[3];
     index_t out_image_size = height * width;
 
-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_height = input_shape[1];
+    index_t input_width = input_shape[2];
+    index_t input_channels = input_shape[3];
     index_t in_image_size = input_height * input_width;
 
     int kernel_h = kernels_[0];
@@ -59,11 +89,6 @@ struct PoolingFunctor {
     int dilation_h = dilations_[0];
     int dilation_w = dilations_[1];
 
-    int paddings[2];
-    std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                         kernels_[0], kernels_[1]};
-    kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                            strides_, this->padding_, paddings);
     // The left-upper most offset of the padded input
     int padded_h_start = 0 - paddings[0] / 2;
     int padded_w_start = 0 - paddings[1] / 2;
@@ -71,25 +96,24 @@ struct PoolingFunctor {
     if (pooling_type_ == MAX) {
 #pragma omp parallel for collapse(2)
       for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
-              T max = std::numeric_limits<T>::lowest();
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
+              T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
                   int inh = padded_h_start + h * stride_h + dilation_h * kh;
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
-                    max = std::max(max, input[input_offset]);
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
+                    res = std::max(res, input[input_offset]);
                   }
                 }
               }
-              output[out_offset] = max;
-              out_offset += 1;
+              *output = res;
+              output++;
             }
           }
         }
@@ -97,11 +121,10 @@ struct PoolingFunctor {
     } else if (pooling_type_ == AVG) {
 #pragma omp parallel for collapse(2)
       for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -110,14 +133,14 @@ struct PoolingFunctor {
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
                     sum += input[input_offset];
                     block_size += 1;
                   }
                 }
               }
-              output[out_offset] = sum / block_size;
-              out_offset += 1;
+              *output = sum / block_size;
+              output++;
             }
           }
         }
@@ -125,22 +148,26 @@ struct PoolingFunctor {
     }
   }
 
-  const PoolingType pooling_type_;
-  const int *kernels_;
-  const int *strides_;
-  const Padding padding_;
-  const int *dilations_;
 };
 
-template <>
+template<>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *input_tensor,
     Tensor *output_tensor);
 
-template <>
-void PoolingFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input_tensor,
-    Tensor *output_tensor);
+template <typename T>
+struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+  void operator()(const Tensor *input_tensor,
+                  Tensor *output_tensor);
+};
 
 } // namespace kernels
 } // namespace mace
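The CPU pooling loops were reordered from NCHW to NHWC; the rewritten inner offsets follow directly from the NHWC flat-index formula. A compilable one-liner capturing it (illustrative helper, not part of the diff):

#include <cassert>
#include <cstdint>

typedef int64_t index_t;

// NHWC flat index: element (n, h, w, c) of a (N, H, W, C) tensor.
inline index_t OffsetNHWC(index_t n, index_t h, index_t w, index_t c,
                          index_t H, index_t W, index_t C) {
  return ((n * H + h) * W + w) * C + c;
}

int main() {
  // Hence the kernels' inner term: (inh * input_width + inw) * input_channels + c.
  assert(OffsetNHWC(0, 2, 1, 3, 4, 5, 8) == (2 * 5 + 1) * 8 + 3);
}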
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 59bb2505c9c379c1b0700d7a515a880a704d72db..27415ebdd8e61ff904360d1c520aab8ecf2b7591 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
                  const index_t channels,
                  const std::vector<CachedInterpolation> &xs_vec,
                  const std::vector<CachedInterpolation> &ys,
-                 float *output) {
-  const index_t in_channel_size = in_height * in_width;
-  const index_t in_batch_num_values = channels * in_channel_size;
-  const index_t out_channel_size = out_height * out_width;
-  const index_t out_batch_num_values = channels * out_channel_size;
+                 T *output) {
+  const index_t in_batch_num_values = channels * in_height * in_width;
+  const index_t out_batch_num_values = channels * out_height * out_width;
   const CachedInterpolation *xs = xs_vec.data();
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for
   for (index_t b = 0; b < batch_size; ++b) {
-    for (index_t c = 0; c < channels; ++c) {
-      const T *input_ptr =
-          images + in_batch_num_values * b + in_channel_size * c;
-      float *output_ptr =
-          output + out_batch_num_values * b + out_channel_size * c;
-      for (index_t y = 0; y < out_height; ++y) {
-        const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width;
-        const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width;
-        const float ys_lerp = ys[y].lerp;
-        for (index_t x = 0; x < out_width; ++x) {
-          auto xs_lower = xs[x].lower;
-          auto xs_upper = xs[x].upper;
-          auto xs_lerp = xs[x].lerp;
-
-          const float top_left = ys_input_lower_ptr[xs_lower];
-          const float top_right = ys_input_lower_ptr[xs_upper];
-          const float bottom_left = ys_input_upper_ptr[xs_lower];
-          const float bottom_right = ys_input_upper_ptr[xs_upper];
-
-          output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left,
-                                      bottom_right, xs_lerp, ys_lerp);
+    const T *batch_input_ptr = images + in_batch_num_values * b;
+    T *batch_output_ptr = output + out_batch_num_values * b;
+
+    for (index_t y = 0; y < out_height; ++y) {
+      const T *y_lower_input_ptr =
+          batch_input_ptr + ys[y].lower * in_width * channels;
+      const T *y_upper_input_ptr =
+          batch_input_ptr + ys[y].upper * in_width * channels;
+      T *y_output_ptr = batch_output_ptr + y * out_width * channels;
+      const float ys_lerp = ys[y].lerp;
+
+      for (index_t x = 0; x < out_width; ++x) {
+        const float xs_lerp = xs[x].lerp;
+        const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
+        const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
+        const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
+        const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
+        T *output_ptr = y_output_ptr + x * channels;
+
+        for (index_t c = 0; c < channels; ++c) {
+          const T top_left = top_left_ptr[c];
+          const T top_right = top_right_ptr[c];
+          const T bottom_left = bottom_left_ptr[c];
+          const T bottom_right = bottom_right_ptr[c];
+
+          output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
+                                      bottom_right, xs_lerp, ys_lerp);
         }
-        output_ptr += out_width;
       }
     }
   }
 }
 
+struct ResizeBilinearFunctorBase {
+  ResizeBilinearFunctorBase(const std::vector<index_t> &size,
+                            bool align_corners)
+      : align_corners_(align_corners), size_(size) {}
+
+ protected:
+  void GetOutputSize(const Tensor *resize_dims,
+                     index_t *out_height,
+                     index_t *out_width) {
+    if (size_[0] < 0 || size_[1] < 0) {
+      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
+      Tensor::MappingGuard resize_dims_mapper(resize_dims);
+      auto dims_data = resize_dims->data<index_t>();
+      *out_height = dims_data[0];
+      *out_width = dims_data[1];
+    } else {
+      *out_height = size_[0];
+      *out_width = size_[1];
+    }
+  }
+
+  bool align_corners_;
+  std::vector<index_t> size_;
+};
+
 template <DeviceType D, typename T>
-class ResizeBilinearFunctor {
- public:
+struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : align_corners_(align_corners), size_(size) {}
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
   void operator()(const Tensor *input, const Tensor *resize_dims, Tensor *output) {
     const index_t batch = input->dim(0);
-    const index_t channels = input->dim(1);
-    const index_t in_height = input->dim(2);
-    const index_t in_width = input->dim(3);
+    const index_t in_height = input->dim(1);
+    const index_t in_width = input->dim(2);
+    const index_t channels = input->dim(3);
 
     index_t out_height;
     index_t out_width;
     GetOutputSize(resize_dims, &out_height, &out_width);
     MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> out_shape{batch, channels, out_height, out_width};
+    std::vector<index_t> out_shape{batch, out_height, out_width, channels};
     output->Resize(out_shape);
 
     Tensor::MappingGuard input_mapper(input);
@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
     ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
                 channels, xs, ys, output_data);
   }
+};
 
- protected:
-  void GetOutputSize(const Tensor *resize_dims,
-                     index_t *out_height,
-                     index_t *out_width) {
-    if (size_[0] < 0 || size_[1] < 0) {
-      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
-      Tensor::MappingGuard resize_dims_mapper(resize_dims);
-      auto dims_data = resize_dims->data<index_t>();
-      *out_height = dims_data[0];
-      *out_width = dims_data[1];
-    } else {
-      *out_height = size_[0];
-      *out_width = size_[1];
-    }
-  }
+template <typename T>
+struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
+  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
- private:
-  bool align_corners_;
-  std::vector<index_t> size_;
+  void operator()(const Tensor *input,
+                  const Tensor *resize_dims,
+                  Tensor *output);
 };
 
-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input, const Tensor *resize_dims, Tensor *output);
-
 } // namespace kernels
 } // namespace mace
diff --git a/mace/mace.bzl b/mace/mace.bzl
index f9e7b6afc50d2908eef34292f522a0f3c4946c75..757334a8b8c0d5b104afd19bd9654ddec24b3eeb 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -22,4 +22,10 @@ def if_android_arm64(a):
   return select({
       "//mace:android_arm64": a,
       "//conditions:default": [],
-  })
\ No newline at end of file
+  })
+
+def if_profiling(a):
+  return select({
+      "//mace:is_profiling": a,
+      "//conditions:default": [],
+  })
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index b4b74b04b84d01ac4f6941c649acabc04f25c0d8..ba0bb38019fbfc6274d09dfa81d9efd8e83ed789 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -6,12 +6,26 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      AddNOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
+REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                       AddNOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         AddNOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         AddNOp<DeviceType::OPENCL, half>);
 
 } // namespace mace
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index a2ffefbbc54e846317415e653078706a2938f67b..155c6830b6aa14e072e3ba67f68ee6421aa427c1 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -10,7 +10,7 @@
 
 namespace mace {
 
-template <DeviceType D, class T>
+template <DeviceType D, typename T>
 class AddNOp : public Operator<D, T> {
  public:
   AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
 
   bool Run() override {
     Tensor *output_tensor = this->outputs_[0];
-    output_tensor->ResizeLike(this->inputs_[0]);
     int n = this->inputs_.size();
     vector<const Tensor *> inputs(n, nullptr);
     for (int i = 0; i < n; ++i) {
op_def_builder.Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder op_def_builder("AddN", "AddNBM"); + for (int i = 0; i < inputs; ++i) { + op_def_builder.Input(internal::MakeString("Input", i).c_str()); + } + op_def_builder.Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); } // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); + net.Sync(); } mace::testing::StartTiming(); while (iters--) { net.RunOp(D); + net.Sync(); } } -#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ - static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ - const int64_t tot = static_cast(iters) * N * SIZE; \ - mace::testing::ItemsProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - AddNBenchmark(iters, N, SIZE); \ - } \ - BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) - -#define BM_ADDN(N, SIZE, TYPE) \ - BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ - BM_ADDN_MACRO(N, SIZE, TYPE, NEON); - -BM_ADDN(10, 1000, float); -BM_ADDN(10, 10000, float); -BM_ADDN(100, 1000, float); -BM_ADDN(100, 10000, float); -} // namespace mace \ No newline at end of file +#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ + static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + AddNBenchmark(iters, INPUTS, N, H, W, C); \ + } \ + BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \ + BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \ + BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL); + +BM_ADDN(2, 1, 240, 240, 256, float); +// BM_ADDN(2, 1, 240, 240, 256, half); +BM_ADDN(4, 1, 240, 240, 256, float); +// BM_ADDN(4, 1, 240, 240, 256, half); + +} // namespace mace diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 3fc58011f623ebf5ff541c1ed2f48d2b9eb5a959..5f9bd2bfe7cce685eca883e6c2159312ca0dd41f 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -9,7 +9,7 @@ namespace mace { class AddnOpTest : public OpsTestBase {}; -template +template void SimpleAdd2() { // Construct graph OpsTestNet net; @@ -20,30 +20,26 @@ void SimpleAdd2() { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); - net.AddInputFromArray("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); // Run net.RunOp(D); - auto expected = CreateTensor({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); + auto expected = CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } -TEST_F(AddnOpTest, CPUSimpleAdd2) { - SimpleAdd2(); -} +TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2(); } -TEST_F(AddnOpTest, NEONSimpleAdd2) { - SimpleAdd2(); -} +/* +TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2(); } -TEST_F(AddnOpTest, OPENCLSimpleAdd2) { - SimpleAdd2(); -} +TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2(); } +*/ -template +template void SimpleAdd3() { // Construct graph OpsTestNet net; @@ -55,62 +51,80 @@ void SimpleAdd3() { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); - net.AddInputFromArray("Input2", {1, 1, 2, 3}, {1, 2, 
3, 4, 5, 6}); - net.AddInputFromArray("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); // Run net.RunOp(D); - auto expected = CreateTensor({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); + auto expected = CreateTensor({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } -TEST_F(AddnOpTest, CPUSimpleAdd3) { - SimpleAdd3(); -} +TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3(); } -TEST_F(AddnOpTest, NEONSimpleAdd3) { - SimpleAdd3(); -} +/* +TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3(); } +*/ -template +template void RandomTest() { - // Construct graph - OpsTestNet net; - OpDefBuilder("AddN", "AddNTest") - .Input("Input1") - .Input("Input2") - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input1", {1, 2, 3, 4}); - net.AddRandomInput("Input2", {1, 2, 3, 4}); - - // Check - net.RunOp(D); - - Tensor result; - result.Copy(*net.GetOutput("Output")); - - // Run - net.RunOp(); - - ExpectTensorNear(*net.GetOutput("Output"), result, 1e-5); -} - -TEST_F(AddnOpTest, CPURandom) { - RandomTest(); + testing::internal::LogToStderr(); + srand(time(NULL)); + + for (int round = 0; round < 10; ++round) { + // generate random input + index_t n = 1 + (rand() % 5); + index_t h = 1 + (rand() % 100); + index_t w = 1 + (rand() % 100); + index_t c = 1 + (rand() % 32); + int input_num = 2 + rand() % 3; + // Construct graph + OpsTestNet net; + auto op_def = OpDefBuilder("AddN", "AddNTest"); + for (int i = 0; i < input_num; ++i) { + op_def.Input("Input" + ToString(i)); + } + op_def.Output("Output").Finalize(net.NewOperatorDef()); + + // Add input data + for (int i = 0; i < input_num; ++i) { + net.AddRandomInput("Input" + ToString(i), {n, h, w, c}); + } + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + for (int i = 0; i < input_num; ++i) { + BufferToImage(net, "Input" + ToString(i), + "InputImage" + ToString(i), + kernels::BufferType::IN_OUT); + } + + auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); + for (int i = 0; i < input_num; ++i) { + op_def_cl.Input("InputImage" + ToString(i)); + } + op_def_cl.Output("OutputImage") + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", + kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.1); + } } -TEST_F(AddnOpTest, NEONRandom) { - RandomTest(); -} - -TEST_F(AddnOpTest, OPENCLRandom) { - RandomTest(); -} +TEST_F(AddnOpTest, OPENCLRandom) { RandomTest(); } } // namespace mace diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 1ce9b1e090bbf171bbe3ff33c07512af12e94c80..76723b2dc2c369257b79fb66b8c472752253700d 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -6,12 +6,26 @@ namespace mace { -REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm") + 
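The rewritten random test cross-checks the half OpenCL kernel against a float CPU reference, loosening the tolerance to 0.1 to absorb fp16 rounding. A condensed, standalone sketch of that reference-check pattern (stand-in data; the real test compares live tensors):

#include <cassert>
#include <cmath>
#include <vector>

bool TensorNear(const std::vector<float> &expected,
                const std::vector<float> &actual, float abs_err) {
  if (expected.size() != actual.size()) return false;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (std::fabs(expected[i] - actual[i]) > abs_err) return false;
  }
  return true;
}

int main() {
  std::vector<float> reference = {1.0f, 2.0f, 3.0f};    // CPU float result
  std::vector<float> candidate = {1.02f, 1.99f, 3.05f}; // fp16 device result
  assert(TensorNear(reference, candidate, 0.1f));       // loose fp16 tolerance
}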
.TypeConstraint("T") + .Build(), + BatchNormOp); -} // namespace mace \ No newline at end of file +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); + +} // namespace mace diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index e0d56173d20e89799e7c2f1a9df33a90dbca47bd..4b34de14a0b298dee564bbd1aeab3f1434b2ac4f 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -13,28 +13,45 @@ static void BatchNorm( int iters, int batch, int channels, int height, int width) { mace::testing::StopTiming(); - if ( D == OPENCL ) - OpenCLRuntime::EnableProfiling(); - OpsTestNet net; - OpDefBuilder("BatchNorm", "BatchNormBM") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .Input("Epsilon") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + else { + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // tuning setenv("MACE_TUNING", "1", 1); net.RunOp(D); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index e13df29c33aad74ea730d39696e9cfa66a3f0aac..73e386caab16bbaff893fb56553a5ba3c4d5bae0 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; template void Simple() { - // Construct graph OpsTestNet net; - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .Input("Epsilon") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 6, 2}, + net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); net.AddInputFromArray("Scale", {1}, {4.0f}); net.AddInputFromArray("Offset", {1}, {2.0}); @@ -32,12 +22,44 @@ void Simple() { net.AddInputFromArray("Var", {1}, {11.67f}); net.AddInputFromArray("Epsilon", {}, {1e-3}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + 
+ OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } // Check auto expected = - CreateTensor({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + CreateTensor({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); @@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { Simple(); } +/* TEST_F(BatchNormOpTest, SimpleNEON) { Simple(); } +*/ TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple(); } +/* TEST_F(BatchNormOpTest, SimpleRandomNeon) { srand(time(NULL)); @@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); } +*/ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { srand(time(NULL)); @@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { index_t channels = 3 + rand() % 50; index_t height = 64; index_t width = 64; + // Construct graph auto &net = test_net(); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); - // tuning + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Run on opencl + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + + // Tuning setenv("MACE_TUNING", "1", 1); net.RunOp(DeviceType::OPENCL); unsetenv("MACE_TUNING"); // Run on opencl net.RunOp(DeviceType::OPENCL); + net.Sync(); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run cpu - net.RunOp(); - - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { @@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { index_t channels = 3 + rand() % 50; index_t height = 103; index_t width = 113; + // Construct graph auto &net = test_net(); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -204,13 +251,38 @@ 
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + + // Run on opencl + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + // tuning setenv("MACE_TUNING", "1", 1); net.RunOp(DeviceType::OPENCL); @@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::OPENCL); net.Sync(); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run cpu - net.RunOp(); - - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } } diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index fa5db7cd470683d97147ee5baf52fb98f3f4753c..61de748b0fc8b8928eb99f8ecdc7e9dc72bca932 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -6,6 +6,9 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND") + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); } // namespace mace diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index d7eeade2620852361844e1e84edb96ecc3b4e281..56711794b7fef1546ec67e63d873289bea2ef1cc 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -6,6 +6,14 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") + .TypeConstraint("T") + .Build(), + BufferToImageOp); + +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") + .TypeConstraint("T") + .Build(), + BufferToImageOp); } // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index ea5fbe21592830bcc31ef303311b15aba3b3a98c..43092084d3f75cacf48ecf9dc9dd3fd3861f557d 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Add input data @@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Run @@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { 
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index ea5fbe21592830bcc31ef303311b15aba3b3a98c..43092084d3f75cacf48ecf9dc9dd3fd3861f557d 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
       .Input("Input")
       .Output("B2IOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
      .Finalize(net.NewOperatorDef());
 
   // Add input data
@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
       .Input("B2IOutput")
       .Output("I2BOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef())
 
   // Run
@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
 }
 
+TEST(BufferToImageTest, ArgHalfSmall) {
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}
+
 TEST(BufferToImageTest, ArgMedia) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
 }
@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
 TEST(BufferToImageTest, Filter3x3Large) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
 }
+
+template <DeviceType D, typename T>
+void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+  OpsTestNet net;
+  OpDefBuilder("BufferToImage", "BufferToImageTest")
+      .Input("Input")
+      .Output("B2IOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<D, float>("Input", input_shape);
+
+  // Run
+  net.RunOp(D);
+
+  OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
+      .Input("B2IOutput")
+      .Output("I2BOutput")
+      .AddIntArg("buffer_type", type)
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(D);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
+}
+
+TEST(BufferToImageTest, ArgFloatToHalfSmall) {
+  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index e76a091c251d01699fe9cc3b9bbdde1791541d82..7d36b1af13034ec0a1d51b451edf3df449f83752 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -6,6 +6,9 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      ChannelShuffleOp<DeviceType::CPU, float>);
 
 } // namespace mace
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index ec47971b72babc3c50b2ec78d1a8554f8c7deb38..df040904bff47587143f4580c07516444341a7b6 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -6,6 +6,9 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      ConcatOp<DeviceType::CPU, float>);
 
 } // namespace mace
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index b3886b296d6b01e21bcc414475ae0f03534df5b8..617bd2c5600670513f67140979fd3ccee3ed6c98 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -6,12 +6,31 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, float>);
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, half>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
+REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                       Conv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, half>);
 
 } // namespace mace
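With Conv2D registered for both float and half, the integer "T" argument on a node is what routes it to one registration or the other at creation time. A hypothetical mini-registry showing the idea (the key strings and map here are illustrative, not the MACE implementation):

#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> registry;  // key -> kernel id (stand-in)
  registry["Conv2D_float"] = 0;
  registry["Conv2D_half"] = 1;
  int dtype_arg = 1;  // e.g. set via AddIntArg("T", static_cast<int>(DT_HALF))
  std::string key = std::string("Conv2D_") + (dtype_arg == 1 ? "half" : "float");
  assert(registry.count(key) == 1);  // resolves to the half kernel
}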
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 24211ca1832921c89828b6ec00f45e33a152b77c..b7f6fc731dc0e092d74c5ef6b7434e61e79635f1 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -33,9 +33,9 @@ static void Conv2d(int iters,
   net.AddRandomInput<D, float>("Bias", {output_channels});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -44,6 +44,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -54,6 +55,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   }
 
@@ -91,39 +93,39 @@ static void Conv2d(int iters,
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
 // ICNet
-BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
-BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
-BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
-BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
 // Test RGB <-> YUV
-BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
-
-BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
-BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
-BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
-BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+//
+//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
+//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
+//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
 
 } // namespace mace
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6120f403b31af34c5689fdd2664ede5924edd826..711bf3891211451429fc3ad0e80e1f55611a4b70 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) {
   TestSimple3x3SAME<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
 
     net.RunOp(D);
 
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
 
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }
 
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3SAME() {
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
 
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
 
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() {
       {1, 3, 3, 1},
       {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
 TEST_F(Conv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::CPU>();
-  TestNHWCSimple3x3SAME<DeviceType::CPU>();
+  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
+  TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
 }
 
 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::OPENCL>();
-  TestNHWCSimple3x3SAME<DeviceType::OPENCL>();
+  TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
+  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
   TestSimple3x3WithoutBias<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
 
     // Run
@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
   // Check
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
 TEST_F(Conv2dOpTest, CPUWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::CPU>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
 }
 
 TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
   TestCombined3x3<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestNHWCCombined3x3() {
   // Construct graph
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 5, 5, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 2},
      {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f,
       0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f,
       0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
-  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
+  net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
        .AddIntsArg("strides", {2, 2})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
 
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("Input")
@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
        .AddIntsArg("strides", {2, 2})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() {
       {1, 3, 3, 2},
       {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
        9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
-TEST_F(Conv2dOpTest, CPUCombined) {
-  TestNHWCCombined3x3<DeviceType::CPU>();
+TEST_F(Conv2dOpTest, CPUStride2) {
+  TestNHWCCombined3x3<DeviceType::CPU, float>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLStride2) {
+  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>(
@@ -425,8 +428,37 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   // Check
   auto expected = CreateTensor<float>(
@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }
 
-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     srand(time(NULL));
 
     // generate random input
-    index_t batch = 3 + rand() % 10;
+    index_t batch = 3 + (rand() % 10);
    index_t height = shape[0];
    index_t width = shape[1];
-    index_t input_channels = shape[2] + rand() % 10;
-    index_t output_channels = shape[3] + rand() % 10;
+    index_t input_channels = shape[2] + (rand() % 10);
+    index_t output_channels = shape[3] + (rand() % 10);
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
 
     // Add input data
-    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, float>(
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
         "Filter", {kernel_h, kernel_w, input_channels, output_channels});
-    net.AddRandomInput<D, float>("Bias", {output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
 
     // run on cpu
     net.RunOp();
@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {1, 1})
+ .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; - for (int kernel_size : {3}) { - for (int stride : {1}) { + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); } } } TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 32, 64, 128}); + TestComplexConvNxNS12({32, 32, 32, 64}); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({107, 113, 5, 7}); + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void TestHalfComplexConvNxNS12(const std::vector &input_shape, + const std::vector &filter_shape) { + testing::internal::LogToStderr(); + srand(time(NULL)); + + auto func = [&](int stride_h, int stride_w, Padding padding) { + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t kernel_h = filter_shape[0]; + index_t kernel_w = filter_shape[1]; + index_t input_channels = filter_shape[2] + (rand() % 10); + index_t output_channels = filter_shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); + }; + + for (int stride : {1, 2}) { + func(stride, stride, VALID); + func(stride, stride, SAME); + } +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { + TestHalfComplexConvNxNS12({32, 32}, + {1, 1, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { + 
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {3, 3, 32, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {15, 1, 256, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {1, 15, 256, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {7, 7, 3, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
+                                                {1, 1, 5, 7});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
+                                                {3, 3, 5, 7});
 }
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 992a6f2aa4584b6a9c5a1378885237fd19af6725..b8cb2e5be759a4838351ceb0405f075a3bbbf364 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -6,15 +6,21 @@
 namespace mace {
 
-REGISTER_CPU_OPERATOR(DepthwiseConv2d,
+REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                          .TypeConstraint<float>("T")
+                          .Build(),
                       DepthwiseConv2dOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(DepthwiseConv2d,
+REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                           .TypeConstraint<float>("T")
+                           .Build(),
                        DepthwiseConv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(DepthwiseConv2d,
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                             .TypeConstraint<float>("T")
+                             .Build(),
                          DepthwiseConv2dOp<DeviceType::OPENCL, float>);
 
 } // namespace mace
diff --git a/mace/ops/fused_conv_2d.cc b/mace/ops/fused_conv_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b0172f9e04cd2d0a098cd701431506856f7f9
--- /dev/null
+++ b/mace/ops/fused_conv_2d.cc
@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/fused_conv_2d.h"
+
+namespace mace {
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, float>);
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, half>);
+
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, half>);
+
+} // namespace mace
diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6baafeaa27365141168511facafb68cc3573073
--- /dev/null
+++ b/mace/ops/fused_conv_2d.h
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_FUSED_CONV_2D_H_
+#define MACE_OPS_FUSED_CONV_2D_H_
+
+#include <memory>
+
+#include "mace/core/operator.h"
+#include "mace/kernels/fused_conv_2d.h"
+#include "mace/ops/conv_pool_2d_base.h"
+
+namespace mace {
+
+template <DeviceType D, typename T>
+class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
+ public:
+  FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        functor_(this->strides_.data(), this->padding_,
+                 this->dilations_.data()) {
+  }
+
+  bool Run() override {
+    const Tensor *input = this->Input(INPUT);
+    const Tensor *filter = this->Input(FILTER);
+    const Tensor *bias = this->InputSize() > 2 ?
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + functor_(input, filter, bias, output); + + return true; + } + + private: + kernels::FusedConv2dFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, FILTER, BIAS); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_FUSED_CONV_2D_H_ diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..896fbbc6ae700ce99968414c052c1ae07119c49c --- /dev/null +++ b/mace/ops/fused_conv_2d_test.cc @@ -0,0 +1,410 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/fused_conv_2d.h" +#include "mace/ops/ops_test_util.h" + +using namespace mace; + +class FusedConv2dOpTest : public OpsTestBase {}; + +template +void TestNHWCSimple3x3VALID() { + OpsTestNet net; + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +template +void TestNHWCSimple3x3SAME() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", 
kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {1, 3, 3, 1}, + {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + +TEST_F(FusedConv2dOpTest, OPENCLSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + +template +void TestNHWCSimple3x3WithoutBias() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +template +void TestConv1x1() { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 10, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + net.AddInputFromArray( + "Filter", {1, 1, 5, 2}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + 
.Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor( + {1, 3, 10, 2}, + {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); +} + +TEST_F(FusedConv2dOpTest, CPUConv1x1) { + TestConv1x1(); +} + +TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { + TestConv1x1(); +} + +template +static void TestComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}); + net.AddRandomInput("Bias", {output_channels}); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + func(kernel_size, kernel_size, stride, stride, SAME); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void 
TestHalfComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + diff --git a/mace/ops/global_avg_pooling.cc b/mace/ops/global_avg_pooling.cc index d507d76fa63ed34c02761c551142faa6a9886a0d..534378445ca59b05af2d5c7e89b46d198b14c4f4 100644 --- a/mace/ops/global_avg_pooling.cc +++ b/mace/ops/global_avg_pooling.cc @@ -6,11 +6,15 @@ namespace mace { -REGISTER_CPU_OPERATOR(GlobalAvgPooling, +REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling") + .TypeConstraint("T") + .Build(), GlobalAvgPoolingOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(GlobalAvgPooling, +REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling") + .TypeConstraint("T") + .Build(), GlobalAvgPoolingOp); #endif // __ARM_NEON diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/image_to_buffer.cc index f41d7475cb9282bae2ff5c23bb3c246738e40774..bcf8b997b2b6da5620bdb340c785e47f37915b37 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/image_to_buffer.cc @@ -6,6 +6,14 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer") + .TypeConstraint("T") 
+ .Build(), + ImageToBufferOp); + +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer") + .TypeConstraint("T") + .Build(), + ImageToBufferOp); } // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 6bdf5db5b8835766304299c679f974a32376bf6c..8d593940cf0c5059d5064a27c7edb3558b9f559b 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -13,6 +13,7 @@ #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { @@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector &shape, std::vector &res) { std::random_device rd; std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); + std::normal_distribution nd(0, 1); index_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); res.resize(size); - std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + if (DataTypeToEnum::value == DT_HALF) { + std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast(nd(gen)); }); + } else { + std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + } } template @@ -289,39 +294,40 @@ inline void ExpectEqual(const double &a, const double &b) { EXPECT_DOUBLE_EQ(a, b); } -inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), y.dtype()); +inline void AssertSameDims(const Tensor &x, const Tensor &y) { ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " << "y.shape [ " << ShapeToString(y) << "]"; } -template ::value> +template ::value> struct Expector; // Partial specialization for float and double. -template -struct Expector { - static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } +template +struct Expector { + static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { ExpectEqual(a(i), b(i)); } } static void Near(const Tensor &x, const Tensor &y, const double abs_err) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b << " index = " << i; @@ -334,17 +340,18 @@ template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value, "T is not a floating point type"); - Expector::Near(x, y, abs_err); + Expector::Near(x, y, abs_err); } -template -std::string ToString(const T &input) { - std::stringstream ss; - ss << input; - return ss.str(); +template +void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { + static_assert(is_floating_point_type::value + && is_floating_point_type::value, + "T is not a floating point type"); + Expector::Near(x, y, abs_err); } -template +template void 
BufferToImage(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, net.Sync(); } -template +template void ImageToBuffer(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 1c4f1af2f55c9f8ea5f2455f3bf6d0ad84f36ac7..17031378f7e93ac6924f794ec352d3009181179d 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -6,11 +6,29 @@ namespace mace { -REGISTER_CPU_OPERATOR(Pooling, PoolingOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(Pooling, PoolingOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); + } // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index f62992f53ed44abae64383e300b873433e9b0216..bbc653ab75d627a412d5fcdfaf5c67772658f24f 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); - std::vector output_shape(4); - std::vector paddings(2); - std::vector filter_shape(4); - // TODO(chenghui): is it kind of a hack? 
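// NOTE: the shape computation removed here is presumably taken over by
// kernels::PoolingFunctor, which now resizes the output itself. A minimal
// sketch of the size arithmetic being relocated (hypothetical helper, not
// part of this patch; assumes dilation == 1):
//
//   index_t PooledSize(index_t in, index_t k, index_t stride, Padding p) {
//     return p == Padding::VALID ? (in - k) / stride + 1        // floor div
//                                : (in + stride - 1) / stride;  // ceil(in / stride)
//   }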
- filter_shape[0] = input->shape()[1]; - filter_shape[1] = input->shape()[0]; - filter_shape[2] = kernels_[0]; - filter_shape[3] = kernels_[1]; - - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), this->dilations_.data(), - this->strides_.data(), this->padding_, output_shape.data(), - paddings.data()); - output->Resize(output_shape); - functor_(input, output); return true; }; diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index bf2b182467426292a2fef53dddd0a0d9d3b09dfc..dcda06b75483e6e0e01cfe16594991d72171d2bf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Add input data net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); + CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, AVG_VALID) { - // Construct graph - auto &net = test_net(); - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::AVG) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - - // Run - net.RunOp(); - - // Check - auto expected = CreateTensor( - {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); -} TEST_F(PoolingOpTest, MAX_SAME) { // Construct graph @@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 3, 3}, - {0, 1, 2, 3, 4, 5, 6, 7, 8}); + net.AddInputFromArray("Input", {1, 3, 3, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {4, 5, 7, 8}); + auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 4, 4}, + "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {10, 11, 14, 15}); + auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 2, 9}, + "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); // Run - net.RunOp(DeviceType::NEON); + net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 1, 5}, {10, 12, 14, 16, 17}); + auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } - -template +template static void SimpleMaxPooling3S2() { // 
Construct graph OpsTestNet net; - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 1, 3, 9}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); - // Run - net.RunOp(D); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + // Run + OpDefBuilder("Pooling", "PoolingTest") + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + } // Check - auto expected = CreateTensor({1, 1, 1, 4}, {20, 22, 24, 26}); + auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) { - SimpleMaxPooling3S2(); -} + TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -template -static void AlignedMaxPooling3S2(Padding padding) { +template +static void MaxPooling3S2(const std::vector &input_shape, + const std::vector strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") @@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) { .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 64, 64}); - // Run - net.RunOp(D); + net.AddRandomInput("Input", input_shape); + + // run on cpu + net.RunOp(); Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run on cpu - net.RunOp(); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); } // TODO(chenghui) : there is a bug. 
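// The rewritten MaxPooling3S2 above follows the CPU-as-reference pattern used
// throughout these tests: run once on CPU, snapshot the result, replay the op
// through OpenCL image buffers, and compare within a tolerance. A condensed
// sketch of that flow (D and T as in the helper's template signature):
//
//   net.RunOp();                                   // CPU reference
//   Tensor expected;
//   expected.Copy(*net.GetOutput("Output"));
//   BufferToImage<D, T>(net, "Input", "InputImage",
//                       kernels::BufferType::IN_OUT);
//   net.RunOp(D);                                  // OpenCL run
//   ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
//                       kernels::BufferType::IN_OUT);
//   ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);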
@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) { //} TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { - AlignedMaxPooling3S2(Padding::VALID); - AlignedMaxPooling3S2(Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) { + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } -template -static void UnalignedMaxPooling3S2(Padding padding) { +TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, AVG_VALID) { // Construct graph - OpsTestNet net; + auto &net = test_net(); OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", padding) + .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 113, 43, 47}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddInputFromArray( + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - // Run on cpu + // Run net.RunOp(); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); -} - -// TODO(chenghui) : there is a bug. 
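// For reference, the AVG_VALID expectation just below is plain 2x2 window
// averaging in NHWC: channel 0 of the input holds 0..15 and channel 1 holds
// 16..31 in row-major order, so the top-left window averages to
// (0 + 1 + 4 + 5) / 4 = 2.5 for channel 0 and (16 + 17 + 20 + 21) / 4 = 18.5
// for channel 1, matching the first pair {2.5, 18.5} of the expected tensor.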
-//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) { -// UnalignedMaxPooling3S2(); -//} + // Check + auto expected = CreateTensor( + {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); -TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { - UnalignedMaxPooling3S2(Padding::VALID); - UnalignedMaxPooling3S2(Padding::SAME); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleAvgPoolingTest() { // Construct graph OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 2, 8, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 1, 2, 8}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); // Check - auto expected = CreateTensor({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); + auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, NEONSimpleAvgPooling) { - SimpleAvgPoolingTest(); -} - TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest(); } -template -static void AlignedAvgPoolingTest(Padding padding) { +template +static void AvgPoolingTest(const std::vector &shape, + const std::vector &kernels, + const std::vector &strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {4, 4}) - .AddIntsArg("strides", {4, 4}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 15, 15}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddRandomInput("Input", shape); - // Run on cpu + // run on cpu net.RunOp(); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); -} - -TEST_F(PoolingOpTest, NEONAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -template -static void UnAlignedAvgPoolingTest(Padding padding) { - // Construct graph - OpsTestNet net; + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {7, 7}) - .AddIntsArg("strides", {7, 7}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input", {3, 128, 31, 
37}); - // Run net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - // Run on cpu - net.RunOp(); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.01); +} - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); +TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } -TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); +TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME); } + +TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME); +} + diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc index 3365439398af6d1aded3d1f28304958da097b7ab..f471ae64665f34ed9b109fdf5c3f2c1c79ce7320 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/relu.cc @@ -6,10 +6,16 @@ namespace mace { -REGISTER_CPU_OPERATOR(Relu, ReluOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(Relu, ReluOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); #endif // __ARM_NEON REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index b8b24ced3b006c88bdd449e923d32c47b79567b7..8eae71819537a99cc08454e1585844f7d77f52e3 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -6,14 +6,26 @@ namespace mace { -REGISTER_CPU_OPERATOR(ResizeBilinear, ResizeBilinearOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(ResizeBilinear, +REGISTER_NEON_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), ResizeBilinearOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(ResizeBilinear, +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), ResizeBilinearOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); + } // namespace mace diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 8429fd6bee0f8617e98268cd4ce97be43935a44c..d9453908c11bff15ad8ee3c996af03523d6fb7d1 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -19,18 +19,30 @@ static void 
ResizeBilinearBenchmark(int iters,
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
-      .Input("Input")
-      .Input("OutSize")
-      .Output("Output")
-      .AddIntsArg("size", {output_height, output_width})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddRandomInput<D, T>("Input",
-                           {batch, channels, input_height, input_width});
+                           {batch, input_height, input_width, channels});
   net.AddInputFromArray<D, index_t>("OutSize", {2}, {output_height, output_width});
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+        .Input("InputImage")
+        .Input("OutSize")
+        .Output("OutputImage")
+        .AddIntsArg("size", {output_height, output_width})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+        .Input("Input")
+        .Input("OutSize")
+        .Output("Output")
+        .AddIntsArg("size", {output_height, output_width})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters,
 #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE)        \
   BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU);  \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, NEON); \
   BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
 
+// SNPE 835 GPU: 6870us
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
+
 BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
 BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
 BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 7b7cee9d97da3afd98e80ff710815f06cf1d8eef..3e50c3b4c15133238fb2e7b937430dc8d13dffdd 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
 
   // Add input data
   vector<float> input(24);
   std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
   net.AddInputFromArray<DeviceType::CPU, index_t>("OutSize", {2}, {1, 2});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
 
   // Add input data
   vector<float> input(24);
   std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
   net.AddInputFromArray<DeviceType::CPU, index_t>("OutSize", {2}, {1, 2});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -65,6 +65,7 @@ template <DeviceType D>
 void TestRandomResizeBilinear() {
   srand(time(nullptr));
   testing::internal::LogToStderr();
+
   for (int round = 0; round < 10; ++round) {
     int batch = 1 + rand() % 5;
     int channels = 1 + rand() % 100;
@@ -72,39 +73,54 @@
     int height = 1 + rand() % 100;
     int width = 1 + rand() % 100;
     int in_height = 1 + rand() % 100;
     int in_width = 1 + rand() % 100;
+    int align_corners = rand() % 2;
    // Construct
graph OpsTestNet net; - OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") - .Input("Input") - .Input("OutSize") - .Output("Output") - .AddIntArg("align_corners", 1) - .AddIntsArg("size", {height, width}) - .Finalize(net.NewOperatorDef()); - // Add input data net.AddRandomInput("Input", - {batch, channels, in_height, in_width}); + {batch, in_height, in_width, channels}); net.AddInputFromArray("OutSize", {2}, {height, width}); - // Run - net.RunOp(D); - Tensor actual; - actual.Copy(*net.GetOutput("Output")); - + OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") + .Input("Input") + .Input("OutSize") + .Output("Output") + .AddIntArg("align_corners", align_corners) + .AddIntsArg("size", {height, width}) + .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - Tensor *expected = net.GetOutput("Output"); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") + .Input("InputImage") + .Input("OutSize") + .Output("OutputImage") + .AddIntArg("align_corners", align_corners) + .AddIntsArg("size", {height, width}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT); + } else { + // TODO support NEON + } // Check - ExpectTensorNear(*expected, actual, 0.001); + ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 0.001); } } +/* TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) { TestRandomResizeBilinear(); } +*/ TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) { TestRandomResizeBilinear(); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 8a7af417768038f6cb66048a375bb6e5ff8fa402..fec9866872e94aa4aa1dd2f218d0585ebdc776c1 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -6,6 +6,9 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("SpaceToBatchND") + .TypeConstraint("T") + .Build(), + SpaceToBatchNDOp); } // namespace mace diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 8e6808219ce27cbe4f122a1e4b909c71f7bab7a3..119e1fed79a7cad1374cdb3891745ec2c83716bb 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -67,12 +67,20 @@ message NodeInput { optional int32 output_port = 2; } +message OutputShape { + repeated int64 dims = 1; +} + message OperatorDef { repeated string input = 1; repeated string output = 2; optional string name = 3; optional string type = 4; repeated Argument arg = 5; + repeated OutputShape output_shape = 6; + + // Memory optimization: only support one single output op + optional int32 mem_id = 10 [default = -1]; // for hexagon mace-nnlib optional uint32 node_id = 100; @@ -82,6 +90,16 @@ message OperatorDef { repeated int32 out_max_byte_size = 104; // only support 32-bit len } +// for memory optimization +message MemoryBlock { + optional int32 mem_id = 1; + optional uint32 x = 2; + optional uint32 y = 3; +} +message MemoryArena { + repeated MemoryBlock mem_block = 1; +} + // for hexagon mace-nnlib message InputInfo { optional string name = 1; diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py index fbf19f5b7cf8d705683a959bece07067ac43a5f9..d30a463ca2bce938d716e799f82049308e044586 100644 --- a/mace/python/tools/tf_converter.py +++ b/mace/python/tools/tf_converter.py @@ -21,7 +21,7 @@ def main(unused_args): if 
FLAGS.runtime == 'dsp': output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( - input_graph_def, FLAGS.input_node, FLAGS.output_node) + input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize) else: output_graph_def = tf_converter_lib.convert_to_mace_pb( input_graph_def) @@ -62,6 +62,11 @@ def parse_args(): type=str, default="softmax", help="e.g., softmax") + parser.add_argument( + "--prequantize", + type=bool, + default=False, + help="e.g., False") return parser.parse_known_args() diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 97575bf23ce9583f1db75ce37d5bc699d0f0189e..27df84accf8859a20454f4c512ce688ccea8081a 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -18,15 +18,6 @@ def convert_tensor(op, tensor): tensor.name = op.outputs[0].name shape = list(tf_tensor.shape) - if (op.name.find('pointwise_kernel') != -1 or - op.name.find('depthwise_kernel') != -1 or - op.name.endswith('weights') or - op.name.endswith('kernel')) \ - and op.outputs[0].consumers()[0].type.find('Conv') != -1: - if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC': - tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1)) - shape = [shape[3], shape[2], shape[0], shape[1]] - # print (tensor.name, shape) tensor.dims.extend(shape) tf_dt = op.get_attr('dtype') @@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = first_op.type op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) padding_arg = op_def.arg.add() padding_arg.name = 'padding' padding_arg.i = padding_mode[first_op.get_attr('padding')] @@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def): strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NCHW' + data_format_arg.s = 'NHWC' if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd': bias_add_op = unresolved_ops[1] @@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'BatchNorm' op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) op_def.output.extend([output.name for output in add_1_op.outputs]) + output_shapes = [] + for output in add_1_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) resolved_count = 7 elif first_op.type == 'Relu6': @@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'Relu' op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) max_limit_arg = op_def.arg.add() max_limit_arg.name = 'max_limit' max_limit_arg.f = 6 @@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'Pooling' op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + 
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 97575bf23ce9583f1db75ce37d5bc699d0f0189e..27df84accf8859a20454f4c512ce688ccea8081a 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -18,15 +18,6 @@ def convert_tensor(op, tensor):
     tensor.name = op.outputs[0].name
 
     shape = list(tf_tensor.shape)
-    if (op.name.find('pointwise_kernel') != -1 or
-        op.name.find('depthwise_kernel') != -1 or
-        op.name.endswith('weights') or
-        op.name.endswith('kernel')) \
-        and op.outputs[0].consumers()[0].type.find('Conv') != -1:
-      if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC':
-        tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
-        shape = [shape[3], shape[2], shape[0], shape[1]]
-    # print (tensor.name, shape)
     tensor.dims.extend(shape)
 
     tf_dt = op.get_attr('dtype')
@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     padding_arg = op_def.arg.add()
     padding_arg.name = 'padding'
     padding_arg.i = padding_mode[first_op.get_attr('padding')]
@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def):
     strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NCHW'
+    data_format_arg.s = 'NHWC'
 
     if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
       bias_add_op = unresolved_ops[1]
@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'BatchNorm'
     op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
     op_def.output.extend([output.name for output in add_1_op.outputs])
+    output_shapes = []
+    for output in add_1_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     resolved_count = 7
 
   elif first_op.type == 'Relu6':
@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'Relu'
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     max_limit_arg = op_def.arg.add()
     max_limit_arg.name = 'max_limit'
     max_limit_arg.f = 6
@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'Pooling'
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     pooling_type_arg = op_def.arg.add()
     pooling_type_arg.name = 'pooling_type'
     pooling_type_arg.i = pooling_type_mode[first_op.type]
@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def):
     kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NCHW'
+    data_format_arg.s = 'NHWC'
   elif first_op.type == 'Add':
     op_def = net_def.op.add()
     op_def.name = first_op.name
     op_def.type = "AddN"
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
-  elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']:
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
+  elif first_op.type == 'ConcatV2':
+    op_def = net_def.op.add()
+    op_def.name = first_op.name
+    op_def.type = "Concat"
+    op_def.input.extend([input.name for input in first_op.inputs])
+    op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
+  elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND',
+                         'BatchToSpaceND', 'BiasAdd', 'FusedBatchNorm']:
     op_def = net_def.op.add()
     op_def.name = first_op.name
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
   else:
-    raise Exception('Unknown Op: ' + first_op.name)
+    raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
     pass
 
   for i in range(resolved_count):
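The six-line output_shape block is now repeated verbatim in every branch of convert_ops above (Conv, BatchNorm, Relu6, Pooling, AddN, Concat, and the pass-through list). A small helper, hypothetical and not part of this patch, would keep the branches from drifting apart; it assumes the mace_pb2 module already imported by tf_converter_lib:

    def add_output_shapes(op_def, tf_op):
      # Record the static TF output shapes on the MACE OperatorDef,
      # exactly as each branch of convert_ops does above.
      output_shapes = []
      for output in tf_op.outputs:
        output_shape = mace_pb2.OutputShape()
        output_shape.dims.extend(output.shape.as_list())
        output_shapes.append(output_shape)
      op_def.output_shape.extend(output_shapes)

Each branch would then reduce to add_output_shapes(op_def, first_op), or add_output_shapes(op_def, add_1_op) in the BatchNorm case.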
diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py
index 8f925059279d2b50b13fc28aaf1aca975ec67bc7..ced16ce853e8f49b9c968e09ed257a8e3bf815b5 100644
--- a/mace/python/tools/tf_dsp_converter_lib.py
+++ b/mace/python/tools/tf_dsp_converter_lib.py
@@ -5,7 +5,7 @@ from dsp_ops import DspOps
 from mace.python.tools import graph_util
 
 # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
-#           --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node
+#           --runtime dsp --input_node input_node --output_node output_node
 
 padding_mode = {
   'NA': 0,
@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
       for follow_op in follow_ops:
         new_follow_op = mace_pb2.OperatorDef()
         new_follow_op.CopyFrom(follow_op)
-        for i in range(len(follow_op.input)):
-          for k in range(3):
+        for i in xrange(len(follow_op.input)):
+          for k in xrange(3):
             if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
               new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
         new_ops.append(new_follow_op)
@@ -220,9 +220,7 @@
   new_net_def = mace_pb2.NetDef()
   new_net_def.tensors.extend(tensor_map.values())
-  for op in net_def.op:
-    if op.name not in skip_ops:
-      new_net_def.op.extend([op])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
   new_net_def.op.extend(new_ops)
   return new_net_def
 
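The next hunk rewrites add_input_output_info to take the I/O data type explicitly instead of reading it off the TF tensors. For a prequantized graph (dtype == DT_UINT8), each logical input and output is described by three entries: the quantized data tensor followed by two 1x1x1x1 DT_FLOAT entries, which appears to follow the (data, min, max) tensor triples the DSP ops use elsewhere in this file (note the k in xrange(3) rewiring and the out_max_byte_size list of [size/4, 4, 4] below). A sketch of what a consumer of the resulting NetDef would observe; the helper name is illustrative only:

    def dump_io_info(net_def):
      # For a DT_UINT8 graph, expect per input: the uint8 data tensor's
      # dims, then two [1, 1, 1, 1] float entries for the min/max range.
      for info in net_def.input_info:
        print('input: dims=%s dtype=%d' % (list(info.dims), info.data_type))
      for info in net_def.output_info:
        print('output: dims=%s dtype=%d' % (list(info.dims), info.data_type))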
@@ -249,29 +247,101 @@ def add_node_id(net_def):
   return net_def
 
-def add_input_output_info(net_def, input_node, output_node, graph):
+def add_input_output_info(net_def, input_node, output_node, graph, dtype):
   input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
   output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
-  for op in net_def.op:
-    if op.name == input_node:
+  input_info = net_def.input_info.add()
+  input_info.dims.extend(input_tensor.shape.as_list())
+  input_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       input_info = net_def.input_info.add()
-      input_info.name = op.name
-      input_info.node_id = op.node_id
-      input_info.dims.extend(input_tensor.shape.as_list())
-      input_info.max_byte_size = max_elem_size(input_tensor)
-      input_info.data_type = find_dtype(input_tensor.dtype)
-    elif op.name == output_node:
+      input_info.dims.extend([1,1,1,1])
+      input_info.data_type = mace_pb2.DT_FLOAT
+
+  output_info = net_def.output_info.add()
+  output_info.dims.extend(output_tensor.shape.as_list())
+  output_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       output_info = net_def.output_info.add()
-      output_info.name = op.name
-      output_info.node_id = op.node_id
-      output_info.dims.extend(output_tensor.shape.as_list())
-      output_info.max_byte_size = max_elem_size(output_tensor)
-      output_info.data_type = find_dtype(output_tensor.dtype)
+      output_info.dims.extend([1,1,1,1])
+      output_info.data_type = mace_pb2.DT_FLOAT
   return net_def
 
-def convert_to_mace_pb(input_graph_def, input_node, output_node):
+def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
+  tensor_map = {}
+  for tensor in net_def.tensors:
+    tensor_map[tensor.name] = tensor
+  op_map = {}
+  for op in net_def.op:
+    op_map[op.name] = op
+  consumers = {}
+  for op in net_def.op:
+    for ipt in op.input:
+      if ipt not in consumers:
+        consumers[ipt] = []
+      consumers[ipt].append(op)
+
+  skip_ops = set()
+  new_ops = []
+  skip_tensors = set()
+
+  # INPUT->Flatten->Minf, Maxf->Quantize
+  for op in net_def.op:
+    if op.type == 'INPUT':
+      input_op = op
+      flatten_op = None
+      quantize_op = None
+      for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
+        if o.type == 'Flatten':
+          flatten_op = o
+        elif o.type == 'Quantize':
+          quantize_op = o
+      if quantize_op is not None:
+        minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
+        skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
+        skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
+
+        new_input_op = mace_pb2.OperatorDef()
+        new_input_op.name = input_op.name
+        new_input_op.type = input_op.type
+        new_input_op.padding = input_op.padding
+        new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
+        new_ops.append(new_input_op)
+        for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
+          new_follow_op = mace_pb2.OperatorDef()
+          new_follow_op.CopyFrom(follow_op)
+          for i in xrange(len(follow_op.input)):
+            for k in xrange(3):
+              if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
+                new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
+          new_ops.append(new_follow_op)
+          skip_ops.add(follow_op.name)
+
+    elif op.type == 'OUTPUT':
+      output_op = op
+      dequantize_op = get_node_from_map(op_map, output_op.input[0])
+      if dequantize_op.type == 'Dequantize':
+        skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
+
+        new_output_op = mace_pb2.OperatorDef()
+        new_output_op.name = output_op.name
+        new_output_op.type = output_op.type
+        new_output_op.input.extend(dequantize_op.input)
+        new_ops.append(new_output_op)
+
+  new_net_def = mace_pb2.NetDef()
+  new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
+  new_net_def.op.extend(new_ops)
+  return new_net_def
+
+def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
   """ nnlib does not have batch norm, so use tensorflow optimizer to fold
   batch norm with convolution. The fold optimization reorders ops, so
@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
   add_output_node(net_def, output_node)
 
   # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
+
+  if prequantize:
+    net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
+
   sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
   net_def_with_node_id = add_node_id(sorted_net_def)
-  final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph)
+  if prequantize:
+    dtype = mace_pb2.DT_UINT8
+  else:
+    dtype = mace_pb2.DT_FLOAT
+  final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
 
   return final_net_def
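With the prequantize path in place, the converter entry point can be driven end to end. A minimal usage sketch, assuming a TF 1.x environment and a frozen, already-quantized graph; the file path and node names are placeholders:

    import tensorflow as tf
    from mace.python.tools import tf_dsp_converter_lib

    with tf.gfile.GFile('quantized_model.pb', 'rb') as f:
      graph_def = tf.GraphDef()
      graph_def.ParseFromString(f.read())

    # prequantize=True strips the INPUT->Flatten->Min/Max->Quantize chain and
    # the trailing Dequantize, so the device exchanges uint8 buffers directly.
    net_def = tf_dsp_converter_lib.convert_to_mace_pb(
        graph_def, 'input_node', 'output_node', prequantize=True)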
diff --git a/mace/python/tools/tf_ops_stats.py b/mace/python/tools/tf_ops_stats.py
index 9301b3f1a5d8537418704ea9b73e50c30460f545..d60487a96434bf1fbda63f0bb456a973e4c07b9b 100644
--- a/mace/python/tools/tf_ops_stats.py
+++ b/mace/python/tools/tf_ops_stats.py
@@ -68,7 +68,7 @@ def main(unused_args):
         if input_name.endswith('weights:0') and input_name in tensor_shapes:
           ksize = tensor_shapes[input_name]
           break
-      print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape.as_list(), op.outputs[0].shape.as_list()))
+      print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape))
       key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format)
       hist_inc(stats, key)
     elif op.type in ['FusedResizeAndPadConv2D']:
@@ -92,6 +92,7 @@ def main(unused_args):
           size = tensor_values[input_name]
           break
       key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners)
+      print(key)
       hist_inc(stats, key)
     elif op.type in ['AvgPool', 'MaxPool']:
       padding = op.get_attr('padding')
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 536a7fb8805bc136e3b235151bbfb433b6c96836..a8b13828de5208047292218a27d76e3f328923b7 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -6,6 +6,7 @@
 #define MACE_UTILS_UTILS_H_
 
 #include <sys/time.h>
+#include <sstream>
 
 namespace mace {
 template <typename Integer>
@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
   return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
 }
 
+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+}
+
 } // namespace mace
 #endif // MACE_UTILS_UTILS_H_
diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh
index fbd4fa007803aa2e6939485ca6b1601ad6b56dc1..b41d4d140303d8b682c49d40d23a35abe81b68c3 100755
--- a/tools/bazel-adb-run.sh
+++ b/tools/bazel-adb-run.sh
@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a
 STRIP=""
 STRIP="--strip always"
 
-bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
+# for profiling
+bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI --define profiling=true
+#bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
+
 if [ $? -ne 0 ]; then
   exit 1
 fi
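The --define profiling=true flag only takes effect because a config_setting keyed on that define gates extra copts through the if_profiling macro loaded in mace/core/BUILD. A sketch of what such a Starlark macro conventionally looks like; the actual definition lives in mace/mace.bzl, and the config_setting label used here is an assumption:

    # mace/mace.bzl (sketch): select() picks the branch whose config_setting
    # matches the build; --define profiling=true activates //mace:is_profiling.
    def if_profiling(a):
      return select({
          "//mace:is_profiling": a,
          "//conditions:default": [],
      })

Note also that the script's first STRIP="" assignment is dead (it is immediately overwritten by STRIP="--strip always"), and the non-profiling bazel invocation is kept only as a comment rather than behind a flag.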