diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a86f2eb1729d784135f5499da6dcc7ba66af776..89916b0959ed482f0c9dcdfd767945d43643f21b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -106,7 +106,7 @@ mace_cc_test: GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//micro/test/ccunit:micro_ops_test" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a mace_cc_benchmark: @@ -133,7 +133,7 @@ model_tests: fi - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file --cl_mem_type=buffer - - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file + - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index 8b2fc40281fe7942401ea775f03da129ce8551e6..079986798de722bf7f7440fa8307b6a28814a44e 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -15,6 +15,8 @@ #include "mace/core/net_def_adapter.h" #include +#include +#include #include #include "mace/core/ops/operator.h" @@ -27,6 +29,40 @@ namespace mace { namespace { + +struct FilterRulerInfo { + const int filter_idx; + const std::vector nhwc_oihw; + const std::vector nchw_oihw; + FilterRulerInfo(int filter_index, const std::vector nhwc2oihw, + const std::vector nchw2oihw) + : filter_idx(filter_index), + nhwc_oihw(std::move(nhwc2oihw)), nchw_oihw(std::move(nchw2oihw)) {} +}; + +typedef std::unordered_map< + std::string, std::unique_ptr> FilterTransposeRuler; + +FilterTransposeRuler GetFilterTransposeRuler() { + FilterTransposeRuler filter_ruler; + // filter's src format is actually HWIO in tf, OIHW in others + // for Conv2D in MACE, the dst format is OIHW + filter_ruler.emplace("Conv2D", make_unique( + 1, std::vector({3, 2, 0, 1}), std::vector({}))); + + // filter's src format is actually HWOI in tf, MIHW in others + filter_ruler.emplace("Deconv2D", make_unique( + 1, std::vector({2, 3, 0, 1}), std::vector({}))); + + filter_ruler.emplace("DepthwiseConv2d", make_unique( + 1, std::vector({3, 2, 0, 1}), std::vector({}))); + + filter_ruler.emplace("DepthwiseDeconv2d", make_unique( + 1, std::vector({2, 3, 0, 1}), std::vector({}))); + + return filter_ruler; +} + DataFormat GetDefaultDataFormat(DeviceType device_type, bool is_quantized_model) { if (device_type == CPU) { @@ -323,7 +359,7 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, VLOG(3) << "Adapt device for op " << op_def->name(); DeviceType target_device_type = target_device->device_type(); DeviceType device_type = DeviceType::CPU; - context->set_device(cpu_device); + context->set_device(target_device); if (target_device_type != DeviceType::CPU) { std::vector producer_devices; for (auto input : op_def->input()) { @@ -344,9 +380,8 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, target_device_type, available_devices, producer_devices); - if (device_type == target_device_type) { - context->set_device(target_device); - } else { + if (device_type != target_device_type) { + context->set_device(cpu_device); LOG(INFO) << "Op " << op_def->name() << "(" << op_def->type() << ")" << " fall back to CPU"; } @@ -411,7 +446,7 @@ MaceStatus NetDefAdapter::AdaptDataFormat( target_mem_type = MemoryType::GPU_BUFFER; } auto inputs_data_format = op_registry_->InputsDataFormat(op_def->type(), - context); + context); DataFormat src_df, dst_df; int input_size = op_def->input_size(); for (int i = 0; i < input_size; ++i) { @@ -425,71 +460,13 @@ MaceStatus NetDefAdapter::AdaptDataFormat( } src_df = output_map->at(op_def->input(i)).data_format; dst_df = inputs_data_format[i]; - if (src_df != DataFormat::NONE - && dst_df != DataFormat::NONE - && output_map->at(op_def->input(i)).shape.size() == 4 - && src_df != dst_df) { - std::string transformed_name = TransformedName(op_def->input(i), - "data_format", static_cast(dst_df)); - if (transformed_set->count(transformed_name) == 0) { - VLOG(1) << "Add Transpose operation " << op_def->name() - << " to transpose tensor " - << op_def->input(i) << "', from data format " - << static_cast(src_df) << " to " - << static_cast(dst_df); - // Only support transpose between NHWC and NCHW for now. - std::vector dst_dims; - if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) { - dst_dims = {0, 2, 3, 1}; - } else if (src_df == DataFormat::NHWC && dst_df == DataFormat::NCHW) { - dst_dims = {0, 3, 1, 2}; - } else { - LOG(FATAL) << "Encounter unsupported data format transpose from " - << static_cast(src_df) << " to " - << static_cast(dst_df); - } - auto &input_info = output_map->at(op_def->input(i)); - auto output_shape = input_info.shape.empty() ? - std::vector() : - TransposeShape(input_info.shape, - dst_dims); - OperatorDef *transpose_op_def = target_net_def->add_op(); - BuildTransposeOpDef( - op_def->input(i), - transformed_name, - output_shape, - dst_dims, - input_info.dtype, - DeviceType::CPU, - transpose_op_def); - // set data format arg - SetProtoArg(transpose_op_def, - "data_format", - static_cast(dst_df)); - // set output memory type argument - SetProtoArg(transpose_op_def, - OutputMemoryTypeTagName(), - target_mem_type); - // update tensor consumer information - output_map->at(op_def->input(i)).consumer_op_indices.push_back( - target_net_def->op_size() - 1); - // update output information map - output_map->emplace( - transformed_name, - InternalOutputInfo( - target_mem_type, - input_info.dtype, - dst_df, - output_shape, - target_net_def->op_size() - 1)); - // update tensor shape map - tensor_shape_map->emplace(transformed_name, output_shape); - // record transformed tensors - transformed_set->insert(transformed_name); - } - // update original op_def's input - op_def->set_input(i, transformed_name); + const std::vector dst_dims = + GetDstDimsFromTransposeRuler(output_map, op_def, i, src_df, dst_df); + if (dst_dims.size() > 0) { + AddTranposeOpForDataFormat(output_map, tensor_shape_map, transformed_set, + target_net_def, target_mem_type, + op_def, i, dst_df, dst_dims); } } return MaceStatus::MACE_SUCCESS; @@ -590,6 +567,86 @@ MaceStatus NetDefAdapter::AdaptMemoryType( return MaceStatus::MACE_SUCCESS; } +std::vector NetDefAdapter::GetDstDimsFromTransposeRuler( + TensorInfoMap *output_map, const OperatorDef *op_def, const int input_idx, + const DataFormat src_df, const DataFormat dst_df) { + std::vector dst_dims; + if (src_df == DataFormat::NONE || dst_df == DataFormat::NONE + || output_map->at(op_def->input(input_idx)).shape.size() != 4) { + return dst_dims; + } + + if (src_df != dst_df) { // for other operators + bool transposable = false; + if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) { + dst_dims = {0, 2, 3, 1}; + transposable = true; + } else if (src_df == DataFormat::NHWC && dst_df == DataFormat::NCHW) { + dst_dims = {0, 3, 1, 2}; + transposable = true; + } else if (dst_df == DataFormat::OIHW) { + static const auto filter_transpose_ruler = GetFilterTransposeRuler(); + auto &op_type = op_def->type(); + MACE_CHECK((filter_transpose_ruler.count(op_type) > 0) && + filter_transpose_ruler.at(op_type)->filter_idx == input_idx); + if (src_df == DataFormat::NCHW) { + dst_dims = filter_transpose_ruler.at(op_type)->nchw_oihw; + transposable = true; + } else if (src_df == DataFormat::NHWC) { + dst_dims = filter_transpose_ruler.at(op_type)->nhwc_oihw; + transposable = true; + } + } + if (!transposable) { + LOG(FATAL) << "Encounter unsupported data format transpose from " + << static_cast(src_df) << " to " + << static_cast(dst_df); + } + } + + return dst_dims; +} + +MaceStatus NetDefAdapter::AddTranposeOpForDataFormat( + TensorInfoMap *output_map, TensorShapeMap *tensor_shape_map, + std::unordered_set *transformed_set, NetDef *target_net_def, + MemoryType target_mem_type, OperatorDef *op_def, const int i, + const DataFormat dst_df, const std::vector &dst_dims) { + std::string transformed_name = TransformedName( + op_def->input(i), "data_format", MakeString(dst_dims)); + if (transformed_set->count(transformed_name) == 0) { + auto &input_info = output_map->at(op_def->input(i)); + auto output_shape = input_info.shape.empty() ? + std::vector() : + TransposeShape(input_info.shape, + dst_dims); + OperatorDef *transpose_op_def = target_net_def->add_op(); + BuildTransposeOpDef(op_def->input(i), transformed_name, output_shape, + dst_dims, input_info.dtype, DeviceType::CPU, + transpose_op_def); + // set data format arg + SetProtoArg(transpose_op_def, "data_format", static_cast(dst_df)); + // set output memory type argument + SetProtoArg(transpose_op_def, + OutputMemoryTypeTagName(), target_mem_type); + // update tensor consumer information + output_map->at(op_def->input(i)).consumer_op_indices.push_back( + target_net_def->op_size() - 1); + + // update output information map + output_map->emplace(transformed_name, InternalOutputInfo( + target_mem_type, input_info.dtype, dst_df, output_shape, + target_net_def->op_size() - 1)); + // update tensor shape map + tensor_shape_map->emplace(transformed_name, output_shape); + // record transformed tensors + transformed_set->insert(transformed_name); + } + // update original op_def's input + op_def->set_input(i, transformed_name); + return MaceStatus::MACE_SUCCESS; +} + std::string NetDefAdapter::DebugString(const NetDef *net_def) { std::stringstream sstream; auto DeviceTypeToStrFunc = [](DeviceType device_type) -> std::string { @@ -630,9 +687,9 @@ std::string NetDefAdapter::DebugString(const NetDef *net_def) { for (auto &op : net_def->op()) { std::string device_type = DeviceTypeToStrFunc( static_cast(op.device_type())); - std::string data_type = DataTypeToString(static_cast( - ProtoArgHelper::GetOptionalArg( - op, "T", static_cast(DT_FLOAT)))); + auto dt = ProtoArgHelper::GetOptionalArg( + op, "T", static_cast(DT_FLOAT)); + std::string data_type = DataTypeToString(static_cast(dt)); std::string mem_type = MemoryTypeToStrFunc( static_cast( ProtoArgHelper::GetOptionalArg( @@ -645,10 +702,10 @@ std::string NetDefAdapter::DebugString(const NetDef *net_def) { sstream << std::endl; sstream << "{" << std::endl; - sstream << " name: " << op.name() << std::endl; - sstream << " type: " << op.type() << std::endl; - sstream << " device: " << device_type << std::endl; - sstream << " data type: " << data_type << std::endl; + sstream << " name: " << op.name() << std::endl; + sstream << " type: " << op.type() << std::endl; + sstream << " device: " << device_type << std::endl; + sstream << " data type: " << data_type << std::endl; sstream << " data format: " << data_format << std::endl; sstream << " memory type: " << mem_type << std::endl; sstream << " inputs: ["; diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h index b285a4a5d913db3051e02fd93f8a50826e176d17..e1da6c22b6577b276bfc7458a340e0d87d4e1e8a 100644 --- a/mace/core/net_def_adapter.h +++ b/mace/core/net_def_adapter.h @@ -34,7 +34,6 @@ class OperatorDef; class OpRegistry; class Workspace; - /////////////////////////////////////////////////////////////////////////////// /// Conventions /// @@ -68,8 +67,8 @@ class NetDefAdapter { NetDef *target_net_def); public: - NetDefAdapter(const NetDefAdapter&) = delete; - NetDefAdapter(const NetDefAdapter&&) = delete; + NetDefAdapter(const NetDefAdapter &) = delete; + NetDefAdapter(const NetDefAdapter &&) = delete; NetDefAdapter &operator=(const NetDefAdapter &) = delete; NetDefAdapter &operator=(const NetDefAdapter &&) = delete; @@ -122,6 +121,15 @@ class NetDefAdapter { MemoryType *op_output_mem_types, NetDef *target_net_def); + std::vector GetDstDimsFromTransposeRuler( + TensorInfoMap *output_map, const OperatorDef *op_def, const int input_idx, + const DataFormat src_df, const DataFormat dst_df); + MaceStatus AddTranposeOpForDataFormat( + TensorInfoMap *output_map, TensorShapeMap *tensor_shape_map, + std::unordered_set *transformed_set, NetDef *target_net_def, + MemoryType target_mem_type, OperatorDef *op_def, const int i, + const DataFormat dst_df, const std::vector &dst_dims); + std::string DebugString(const NetDef *net_def); private: diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index d679e9ef5f4bc6512f05f0e8445714b6793cc064..86159293c308972b381ab3ca97780cce4bd6d9f3 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -18,6 +18,9 @@ #include #include +#include "mace/core/ops/op_condition_builder.h" +#include "mace/core/registry/ops_registry.h" + namespace mace { namespace ops { @@ -435,5 +438,37 @@ void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, } } +#ifdef MACE_ENABLE_OPENCL +void SetFilterMemoryType(OpConditionContext *context, + OpenCLBufferType buffer_type) { + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (context->device()->device_type() == DeviceType::GPU) { + if (context->device()->gpu_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; + } else { + mem_type = MemoryType::GPU_BUFFER; + } + auto filter_tensor = context->workspace()->GetTensor( + context->operator_def()->input(1)); + if (filter_tensor == nullptr || !filter_tensor->is_weight()) { + context->SetInputOpenCLBufferType(1, buffer_type); + } + } + context->set_output_mem_type(mem_type); +} +#endif // MACE_ENABLE_OPENCL + +void RegisterFilterDataFormat(OpRegistry *op_registry, const char *op_name) { + auto builder = OpConditionBuilder(op_name).SetInputsDataFormatSelector( + [](OpConditionContext *context) -> std::vector { + DataFormat op_data_format = static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; + }); + MACE_REGISTER_OP_CONDITION(op_registry, builder); +} + } // namespace ops } // namespace mace diff --git a/mace/ops/common/conv_pool_2d_util.h b/mace/ops/common/conv_pool_2d_util.h index 389575d76a78b7154887865f203ee8c29f059a4d..e3ab122378237310434836de68521289ff94266e 100644 --- a/mace/ops/common/conv_pool_2d_util.h +++ b/mace/ops/common/conv_pool_2d_util.h @@ -16,6 +16,10 @@ #define MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_ #include + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL #include "mace/core/tensor.h" namespace mace { @@ -31,6 +35,9 @@ enum RoundType { CEIL = 1, }; +class OpConditionContext; +class OpRegistry; + namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, @@ -98,6 +105,13 @@ void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, FrameworkType framework_type, DataFormat data_format); +#ifdef MACE_ENABLE_OPENCL +void SetFilterMemoryType(OpConditionContext *context, + OpenCLBufferType buffer_type); +#endif // MACE_ENABLE_OPENCL + +void RegisterFilterDataFormat(OpRegistry *op_registry, const char *op_name); + } // namespace ops } // namespace mace diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index d40770ea40475810bfdd2c5bb8450d4101c0ad08..03b098c65400331d3d57ec6402186e27e01825f5 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -445,33 +445,41 @@ class Conv2dOp : public ConvPool2dOpBase { kernel_ = make_unique(); } // Transform filter tensor to target format - if ((wino_block_size_ == 2 || wino_block_size_ == 4) && - (kernel_->CheckUseWinograd( - context->device()->gpu_runtime()->opencl_runtime(), - context->workspace()->GetTensor( - operator_def_->input(1))->shape(), - std::vector(operator_def_->output_shape(0).dims().begin(), - operator_def_->output_shape(0).dims().end()), - strides_.data(), - dilations_.data(), - &wino_block_size_))) { - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, - OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) - == MaceStatus::MACE_SUCCESS); + auto *filter_tensor = + context->workspace()->GetTensor(operator_def_->input(FILTER)); + if (filter_tensor != nullptr && filter_tensor->is_weight()) { + if ((wino_block_size_ == 2 || wino_block_size_ == 4) && + (kernel_->CheckUseWinograd( + context->device()->gpu_runtime()->opencl_runtime(), + filter_tensor->shape(), + std::vector(operator_def_->output_shape(0).dims().begin(), + operator_def_->output_shape(0).dims().end()), + strides_.data(), + dilations_.data(), + &wino_block_size_))) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) + == MaceStatus::MACE_SUCCESS); + } else { + wino_block_size_ = 0; + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + } } else { + // we don't know whether the kernal support winograd, so disable it. wino_block_size_ = 0; - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, - OpenCLBufferType::CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); } + if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); + auto ret = TransformFilter(context, operator_def_.get(), 2, + OpenCLBufferType::ARGUMENT, mem_type); + MACE_CHECK(ret == MaceStatus::MACE_SUCCESS); } } + MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); const Tensor *filter = this->Input(FILTER); @@ -506,6 +514,17 @@ void RegisterConv2D(OpRegistry *op_registry) { #endif // MACE_ENABLE_QUANTIZE MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("Conv2D").SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + SetFilterMemoryType(context, OpenCLBufferType::CONV2D_FILTER); + })); +#endif // MACE_ENABLE_OPENCL + + RegisterFilterDataFormat(op_registry, "Conv2D"); } } // namespace ops diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index a15395d2fc9c55009d650cb314d2e30abf3ced6c..4e0a60475ba9a52b06acef53224743c40f8ce0ee 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -154,10 +154,14 @@ class Deconv2dOp : public Deconv2dOpBase { } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, - OpenCLBufferType::CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + auto *filter_tensor = + context->workspace()->GetTensor(operator_def_->input(1)); + if (filter_tensor != nullptr && filter_tensor->is_weight()) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + } if (model_type_ == FrameworkType::TENSORFLOW) { if (operator_def_->input_size() >= 4) { MACE_CHECK(TransformFilter( @@ -238,34 +242,28 @@ void RegisterDeconv2D(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); + #ifdef MACE_ENABLE_OPENCL MACE_REGISTER_OP_CONDITION( op_registry, - OpConditionBuilder("Deconv2D") - .SetInputMemoryTypeSetter( - [](OpConditionContext *context) -> void { - MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->device_type() == DeviceType::GPU) { - if (context->device()->gpu_runtime()->UseImageMemory()) { - mem_type = MemoryType::GPU_IMAGE; - } else { - MACE_NOT_IMPLEMENTED; - } - context->set_output_mem_type(mem_type); - FrameworkType framework_type = - static_cast( - ProtoArgHelper::GetOptionalArg( - *(context->operator_def()), "framework_type", - FrameworkType::TENSORFLOW)); - if (framework_type == FrameworkType::TENSORFLOW) { - context->SetInputInfo(2, MemoryType::CPU_BUFFER, - DataType::DT_INT32); - } - } else { - context->set_output_mem_type(mem_type); - } - })); + OpConditionBuilder("Deconv2D").SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + SetFilterMemoryType(context, OpenCLBufferType::DW_CONV2D_FILTER); + if (context->device()->device_type() == DeviceType::GPU) { + FrameworkType framework_type = + static_cast( + ProtoArgHelper::GetOptionalArg( + *(context->operator_def()), "framework_type", + FrameworkType::TENSORFLOW)); + if (framework_type == FrameworkType::TENSORFLOW) { + context->SetInputInfo(2, MemoryType::CPU_BUFFER, + DataType::DT_INT32); + } + } + })); #endif // MACE_ENABLE_OPENCL + + RegisterFilterDataFormat(op_registry, "Deconv2D"); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 2d52af6a4060a7307b98b2a5ab1b7570ede29ee0..903a2cfd6d082eb992627d8bd898852b9b3c8e79 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -417,38 +417,13 @@ void RegisterDepthwiseConv2d(OpRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL MACE_REGISTER_OP_CONDITION( op_registry, - OpConditionBuilder("DepthwiseConv2d") - .SetInputMemoryTypeSetter( - [](OpConditionContext *context) -> void { - MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->device_type() == DeviceType::GPU) { - if (context->device()->gpu_runtime()->UseImageMemory()) { - mem_type = MemoryType::GPU_IMAGE; - } else { - mem_type = MemoryType::GPU_BUFFER; - } - auto filter_tensor = context->workspace()->GetTensor( - context->operator_def()->input(1)); - if (filter_tensor == nullptr || !filter_tensor->is_weight()) { - context->SetInputOpenCLBufferType( - 1, OpenCLBufferType::DW_CONV2D_FILTER); - } - } - context->set_output_mem_type(mem_type); - })); + OpConditionBuilder("DepthwiseConv2d").SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + SetFilterMemoryType(context, OpenCLBufferType::DW_CONV2D_FILTER); + })); #endif // MACE_ENABLE_OPENCL - MACE_REGISTER_OP_CONDITION( - op_registry, - OpConditionBuilder("DepthwiseConv2d") - .SetInputsDataFormatSelector( - [](OpConditionContext *context) -> std::vector { - DataFormat op_data_format = - static_cast( - ProtoArgHelper::GetOptionalArg( - *context->operator_def(), "data_format", - static_cast(DataFormat::NONE))); - return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; - })); + + RegisterFilterDataFormat(op_registry, "DepthwiseConv2d"); } } // namespace ops diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 5e1311076f27cfc8beff232dcae591aa818545c4..615b5eccc93a359eea915f5da541d4e57fff3b35 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -160,10 +160,13 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, - OpenCLBufferType::DW_CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + auto *filter_tensor = + context->workspace()->GetTensor(operator_def_->input(1)); + if (filter_tensor != nullptr && filter_tensor->is_weight()) { + auto ret = TransformFilter(context, operator_def_.get(), 1, + OpenCLBufferType::DW_CONV2D_FILTER, mem_type); + MACE_CHECK(ret == MaceStatus::MACE_SUCCESS); + } if (operator_def_->input_size() >= 3) { MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, @@ -223,6 +226,17 @@ void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) { DepthwiseDeconv2dOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("DepthwiseDeconv2d").SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + SetFilterMemoryType(context, OpenCLBufferType::DW_CONV2D_FILTER); + })); +#endif // MACE_ENABLE_OPENCL + + RegisterFilterDataFormat(op_registry, "DepthwiseDeconv2d"); } } // namespace ops diff --git a/mace/ops/extract_image_patches.cc b/mace/ops/extract_image_patches.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab0c8de955c6b344be4c2646d8d9724e0893a72a --- /dev/null +++ b/mace/ops/extract_image_patches.cc @@ -0,0 +1,218 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for fused StatisticsExtraction and StatisticsPooling +// Components in Kaldi. +// This op is used to extract moving-average mean and standard-deviation +// statistics of input data. +// 'forward_indexes' indicates which frames of input will be used for +// extraction. +// save statistics results. +// 'forward_indexes' and 'count' were from precomputed index in kaldi. +// Reference to tools/extract_pooling.py and +// http://kaldi-asr.org/doc/nnet-general-component_8h_source.html#l00158 + +#include +#include + +#include "mace/core/ops/operator.h" +#include "mace/core/registry/ops_registry.h" +#include "mace/core/tensor.h" +#include "mace/ops/conv_pool_2d_base.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/image/extract_image_patches.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace ops { + +template +class ExtractImagePatchesOp; + +template +class ExtractImagePatchesOp : public ConvPool2dOpBase { + public: + explicit ExtractImagePatchesOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + kernels_(Operation::GetRepeatedArgs("kernels")) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + std::vector output_shape(4); + std::vector filter_shape = { + input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]}; + + std::vector paddings(2); + if (paddings_.empty()) { + ops::CalcNCHWPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input_tensor->shape().data(), filter_shape.data(), + paddings_.data(), dilations_.data(), strides_.data(), + RoundType::FLOOR, output_shape.data()); + } + output_shape[1] *= kernels_[0] * kernels_[1]; + + MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input_tensor); + Tensor::MappingGuard output_guard(output_tensor); + const T *input = input_tensor->data(); + MACE_CHECK(output_tensor->dtype() == DataTypeToEnum::value); + T *output = output_tensor->mutable_data(); + const index_t *input_shape = input_tensor->shape().data(); + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; + + return ExtractImagePatches(context, input, input_shape, output_shape.data(), + kernels_.data(), strides_.data(), + dilations_.data(), pad_hw, output); + } + + private: + MaceStatus ExtractImagePatches(const OpContext *context, + const T *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + T *output) { + const index_t batch = out_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t in_c = c % in_channels; + const index_t filter_idx = c / in_channels; + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + in_c * in_image_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + index_t out_offset = out_base + h * out_width + w; + index_t fh = filter_idx / filter_hw[1]; + index_t fw = filter_idx % filter_hw[1]; + index_t inh = h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + output[out_offset] = input[input_offset]; + } else { + output[out_offset] = 0; + } + } + } + } + } + }, 0, batch, 1, 0, out_channels, 1); + + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector kernels_; + + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +#ifdef MACE_ENABLE_OPENCL +template<> +class ExtractImagePatchesOp : public ConvPool2dOpBase { + public: + explicit ExtractImagePatchesOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + kernels_(Operation::GetRepeatedArgs("kernels")) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { + kernel_ = make_unique(); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + return kernel_->Compute(context, input, kernels_.data(), strides_.data(), + padding_type_, paddings_, dilations_.data(), + output); + } + + private: + std::vector kernels_; + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterExtractImagePatches(OpRegistry *op_registry) { + MACE_REGISTER_OP(op_registry, "ExtractImagePatches", ExtractImagePatchesOp, + DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ExtractImagePatches", + ExtractImagePatchesOp, DeviceType::CPU); + MACE_REGISTER_GPU_OP(op_registry, "ExtractImagePatches", + ExtractImagePatchesOp); + + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("ExtractImagePatches").SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return {DeviceType::CPU, DeviceType::GPU}; + } + auto kernels = ProtoArgHelper::GetRepeatedArgs( + *op, "kernels"); + auto &output_shape = op->output_shape(0); + auto &output_dims = output_shape.dims(); + auto in_channel = output_dims[3] / kernels[0] / kernels[1]; + if (output_shape.dims_size() != 4 || in_channel % 4 != 0) { + return {DeviceType::CPU}; + } +#ifdef MACE_ENABLE_OPENCL + if (context->device()->device_type() == DeviceType::GPU) { + auto opencl_runtime = + context->device()->gpu_runtime()->opencl_runtime(); + auto max_2d_size = opencl_runtime->GetMaxImage2DSize(); + auto image_width = output_dims[2] * output_dims[3] / 4; + if (image_width > static_cast(max_2d_size[0])) { + return {DeviceType::CPU}; + } + } +#endif // MACE_ENABLE_OPENCL + + return {DeviceType::CPU, DeviceType::GPU}; + })); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/cl/deconv_2d.cl b/mace/ops/opencl/cl/deconv_2d.cl index 2837e5c78c1d009fce12891a8c642c1a25910763..ef7c70250fb0ea080b011c5349dbe98c931bf417 100644 --- a/mace/ops/opencl/cl/deconv_2d.cl +++ b/mace/ops/opencl/cl/deconv_2d.cl @@ -85,6 +85,7 @@ __kernel void deconv_2d(OUT_OF_RANGE_PARAMS for (int f_y = f_start_y, idx_h = start_y ; f_y >= 0; f_y -= stride_h, ++idx_h) { index_y = mad24(b, in_height, idx_h); in_pos.y = select(index_y, -1, idx_h < 0 || idx_h >= in_height); +#pragma unroll for (int f_x = f_start_x, idx_w = start_x; f_x >= 0; f_x -= stride_w, ++idx_w) { f_pos_y = mad24(f_y, kernel_w, f_x); f_pos_y = mad24(c, kernel_size, f_pos_y); diff --git a/mace/ops/opencl/cl/extract_image_patches.cl b/mace/ops/opencl/cl/extract_image_patches.cl new file mode 100644 index 0000000000000000000000000000000000000000..b41d3af3a58015c4c8da60b2a9251d6ab23dd372 --- /dev/null +++ b/mace/ops/opencl/cl/extract_image_patches.cl @@ -0,0 +1,56 @@ +#include + +__kernel void extract_image_patches(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int pad_top, + __private const int pad_left, + __private const int stride_h, + __private const int stride_w, + __private const int kernel_h, + __private const int kernel_w, + __write_only image2d_t output) { + + const int out_chan_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 + || out_hb_idx >= global_size_dim2) { + return; + } +#endif + const int out_width = global_size_dim1; + const int kernel_size = kernel_h * kernel_w; + const int in_channel = global_size_dim0 / kernel_size; + + const int n_b = out_hb_idx / out_height; + const int mod_b = out_hb_idx - mul24(n_b, out_height); + const int in_batch_base = mul24(n_b, in_height); + const int in_height_start = mad24(mod_b, stride_h, -pad_top); + const int in_width_start = mad24(out_width_idx, stride_w, -pad_left); + const int in_chan_idx = out_chan_idx % in_channel; + const int in_channel_base = mul24(in_chan_idx, in_width); + + const int kernel_base = out_chan_idx / in_channel; + const int kernel_h_idx = kernel_base / kernel_w; + const int kernel_w_idx = kernel_base % kernel_w; + + int in_height_idx = in_height_start + kernel_h_idx; + in_height_idx = select(in_batch_base + in_height_idx, -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + int in_width_idx = in_width_start + kernel_w_idx; + in_width_idx = select(in_channel_base + in_width_idx, -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + const int pos = mad24(out_chan_idx, out_width, out_width_idx); + if (in_height_idx != -1 && in_width_idx != -1) { + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + WRITE_IMAGET(output, (int2)(pos, out_hb_idx), in); + } else { + WRITE_IMAGET(output, (int2)(pos, out_hb_idx), 0); + } +} diff --git a/mace/ops/opencl/extract_image_patches.h b/mace/ops/opencl/extract_image_patches.h new file mode 100644 index 0000000000000000000000000000000000000000..e5d629af980119da435d6053e64ec6dbf165fe77 --- /dev/null +++ b/mace/ops/opencl/extract_image_patches.h @@ -0,0 +1,44 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_EXTRACT_IMAGE_PATCHES_H_ +#define MACE_OPS_OPENCL_EXTRACT_IMAGE_PATCHES_H_ + +#include + +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { + +class OpContext; +class Tensor; +namespace ops { +class OpenCLExtractImagePatchesKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLExtractImagePatchesKernel); +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_EXTRACT_IMAGE_PATCHES_H_ diff --git a/mace/ops/opencl/image/extract_image_patches.cc b/mace/ops/opencl/image/extract_image_patches.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b426c03ad276bd4df68e53cbf01974a8c8f4def --- /dev/null +++ b/mace/ops/opencl/image/extract_image_patches.cc @@ -0,0 +1,124 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/extract_image_patches.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ExtractImagePatchesKernel::Compute( + OpContext *context, + const Tensor *input, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1, + "ExtractImagePatches opencl kernel not support dilation yet"); + MACE_CHECK(input->dim(3) % 4 == 0, "Only support channel % 4 == 0"); + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + output_shape[3] *= kernels[0] * kernels[1]; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("extract_image_patches"); + built_options.emplace("-Dextract_image_patches=" + kernel_name); + + if (input->dtype() == output->dtype()) { + auto data_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("extract_image_patches", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); + kernel_.setArg(idx++, strides[0]); + kernel_.setArg(idx++, strides[1]); + kernel_.setArg(idx++, kernels[0]); + kernel_.setArg(idx++, kernels[1]); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("extract_image_patches_opencl_kernel_", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/extract_image_patches.h b/mace/ops/opencl/image/extract_image_patches.h new file mode 100644 index 0000000000000000000000000000000000000000..eb34a5010967ddfb21d8cb390637786df93e3311 --- /dev/null +++ b/mace/ops/opencl/image/extract_image_patches.h @@ -0,0 +1,57 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_IMAGE_EXTRACT_IMAGE_PATCHES_H_ +#define MACE_OPS_OPENCL_IMAGE_EXTRACT_IMAGE_PATCHES_H_ + +#include +#include +#include +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/runtime/opencl/opencl_helper.h" +#include "mace/core/tensor.h" +#include "mace/ops/opencl/extract_image_patches.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +class ExtractImagePatchesKernel : public OpenCLExtractImagePatchesKernel { + public: + MaceStatus Compute( + OpContext *context, + const Tensor *input, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_IMAGE_EXTRACT_IMAGE_PATCHES_H_ diff --git a/mace/ops/registry/ops_registry.cc b/mace/ops/registry/ops_registry.cc index 9135fa0019982b97a68460e0d727a116aa725858..1478386b196a4431cd8263b29763d0af071151d9 100644 --- a/mace/ops/registry/ops_registry.cc +++ b/mace/ops/registry/ops_registry.cc @@ -56,6 +56,7 @@ extern void RegisterPad(OpRegistry *op_registry); extern void RegisterPadContext(OpRegistry *op_registry); extern void RegisterPNorm(OpRegistry *op_registry); extern void RegisterPooling(OpRegistry *op_registry); +extern void RegisterExtractImagePatches(OpRegistry *op_registry); extern void RegisterReduce(OpRegistry *op_registry); extern void RegisterReplaceIndex(OpRegistry *op_registry); extern void RegisterPriorBox(OpRegistry *op_registry); @@ -136,6 +137,7 @@ void RegisterAllOps(OpRegistry *registry) { ops::RegisterPadContext(registry); ops::RegisterPNorm(registry); ops::RegisterPooling(registry); + ops::RegisterExtractImagePatches(registry); ops::RegisterReduce(registry); ops::RegisterReplaceIndex(registry); ops::RegisterPriorBox(registry); diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl index db8e9730d2d28ef74505cdb809f7d2fc5dec2c3d..b2a82f0cd1e69bf22401cd553cf8d964769fbbe5 100644 --- a/repository/opencl-kernel/opencl_kernel_configure.bzl +++ b/repository/opencl-kernel/opencl_kernel_configure.bzl @@ -46,6 +46,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/eltwise.cl")) + unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/extract_image_patches.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/fully_connected.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lstmcell.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/matmul.cl")) diff --git a/test/ccunit/mace/ops/extract_image_patches_test.cc b/test/ccunit/mace/ops/extract_image_patches_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0776ae0c06cddc1335d081a6b0f83acbc1a6c5a --- /dev/null +++ b/test/ccunit/mace/ops/extract_image_patches_test.cc @@ -0,0 +1,296 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class ExtractImagePatchesOpTest : public OpsTestBase {}; + +TEST_F(ExtractImagePatchesOpTest, VALID) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); + + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + + // Check + auto expected = + net.CreateTensor({1, 2, 2, 8}, + {0, 16, 1, 17, 4, 20, 5, 21, 2, 18, 3, 19, 6, 22, 7, 23, + 8, 24, 9, 25, 12, 28, 13, 29, 10, 26, 11, 27, 14, 30, 15, 31}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ExtractImagePatchesOpTest, SAME) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray("Input", {1, 3, 3, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}); + + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + + // Check + auto expected = net.CreateTensor( + {1, 2, 2, 4}, {0, 1, 3, 4, 2, 0, 5, 0, 6, 7, 0, 0, 8, 0, 0, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ExtractImagePatchesOpTest, VALID_DILATION) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 4, 4, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {2, 2}) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + + // Check + auto expected = net.CreateTensor( + {1, 2, 2, 4}, {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ExtractImagePatchesOpTest, k2x2s2x2) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 2, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); + + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + + // Check + auto expected = net.CreateTensor( + {1, 1, 5, 4}, + {0, 1, 9, 10, 2, 3, 11, 12, 4, 5, 13, 14, 6, 7, 15, 16, 8, 0, 17, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +namespace { +template +void SimpleExtractImagePatches3S2() { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); + + if (D == DeviceType::CPU) { + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + // Run + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + } else if (D == DeviceType::GPU) { + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + } + + // Check + auto expected = net.CreateTensor({1, 1, 4, 9}, + {0, 1, 2, 9, 10, 11, 18, 19, 20, + 2, 3, 4, 11, 12, 13, 20, 21, 22, + 4, 5, 6, 13, 14, 15, 22, 23, 24, + 6, 7, 8, 15, 16, 17, 24, 25, 26}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} +} // namespace + +TEST_F(ExtractImagePatchesOpTest, CPUSimpleExtractImagePatches3S2) { + SimpleExtractImagePatches3S2(); +} + +namespace { +template +void ExtractImagePatches3S2(const std::vector &input_shape, + const std::vector &strides, + Padding padding) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", input_shape); + + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + // run on cpu + net.RunOp(); + + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); + + OpDefBuilder("ExtractImagePatches", "ExtractImagePatchesTest") + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + + if (DataTypeToEnum::value == DT_HALF) { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, + 1e-4); + } else { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + } +} +} // namespace + +TEST_F(ExtractImagePatchesOpTest, OPENCLAlignedExtractImagePatches3S2) { + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 1}, Padding::SAME); + ExtractImagePatches3S2({3, 63, 31, 32}, {2, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 65, 27, 32}, {2, 1}, Padding::SAME); +} + +TEST_F(ExtractImagePatchesOpTest, OPENCLHalfAlignedExtractImagePatches3S2) { + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); + ExtractImagePatches3S2({3, 64, 32, 32}, {2, 1}, Padding::SAME); + ExtractImagePatches3S2({3, 63, 31, 32}, {2, 2}, Padding::VALID); + ExtractImagePatches3S2({3, 65, 27, 32}, {2, 1}, Padding::SAME); +} + + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py index 4c23e65fbf9efa190c6f8e87e86a507f92b3af8e..7c511505233a874e3b4dc82a4c76ec756c935f44 100644 --- a/tools/python/transform/base_converter.py +++ b/tools/python/transform/base_converter.py @@ -109,6 +109,7 @@ MaceSupportedOps = [ 'Dequantize', 'Eltwise', 'ExpandDims', + 'ExtractImagePatches', 'ExtractPooling', 'Fill', 'FullyConnected', @@ -173,6 +174,7 @@ MaceFixedDataFormatOps = [MaceOp.BatchNorm, MaceOp.DepthwiseDeconv2d, MaceOp.FullyConnected, MaceOp.Pooling, + MaceOp.ExtractImagePatches, MaceOp.ResizeBicubic, MaceOp.ResizeBilinear, MaceOp.ResizeNearestNeighbor, diff --git a/tools/python/transform/tensorflow_converter.py b/tools/python/transform/tensorflow_converter.py index 98b67f63c4669693334bbb0f166e55a6b2c28682..29d20b7f495685ab18574acc4f34d9fededc1a7a 100644 --- a/tools/python/transform/tensorflow_converter.py +++ b/tools/python/transform/tensorflow_converter.py @@ -72,6 +72,7 @@ TFSupportedOps = [ 'Div', 'Equal', 'ExpandDims', + 'ExtractImagePatches', 'FakeQuantWithMinMaxVars', 'FakeQuantWithMinMaxArgs', 'Fill', @@ -231,6 +232,8 @@ class TensorflowConverter(base_converter.ConverterInterface): TFOpType.Div.name: self.convert_elementwise, TFOpType.Equal.name: self.convert_elementwise, TFOpType.ExpandDims.name: self.convert_expand_dims, + TFOpType.ExtractImagePatches.name: + self.convert_extract_image_patches, TFOpType.FakeQuantWithMinMaxVars.name: self.convert_fake_quantize, TFOpType.FakeQuantWithMinMaxArgs.name: self.convert_fake_quantize, TFOpType.Fill.name: self.convert_fill, @@ -714,6 +717,27 @@ class TensorflowConverter(base_converter.ConverterInterface): kernels_arg.name = MaceKeyword.mace_kernel_str kernels_arg.ints.extend(tf_op.get_attr(tf_kernel_str)[1:3]) + def convert_extract_image_patches(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.ExtractImagePatches.name + + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = self.padding_mode[tf_op.get_attr(tf_padding_str)].value + + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend(tf_op.get_attr(tf_strides_str)[1:3]) + + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + dilations = tf_op.get_attr('rates')[1:3] + dilation_arg.ints.extend(dilations) + + kernels_arg = op.arg.add() + kernels_arg.name = MaceKeyword.mace_kernel_str + kernels_arg.ints.extend(tf_op.get_attr('ksizes')[1:3]) + def convert_softmax(self, tf_op): op = self.convert_general_op(tf_op) op.type = MaceOp.Softmax.name diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py index c35973a0622ad84f6407bfb4c6a8adde8bcb42a6..2d0c7e98a196f6b901d2c18f100089a80d2eb161 100644 --- a/tools/python/transform/transformer.py +++ b/tools/python/transform/transformer.py @@ -1218,6 +1218,7 @@ class Transformer(base_converter.ConverterInterface): for op in net.op: if op.type in [MaceOp.Deconv2D.name, MaceOp.DepthwiseDeconv2d] \ + and op.input[1] in self._consts \ and op.input[1] not in transposed_deconv_filter: filter = self._consts[op.input[1]] filter_data = np.array(filter.float_data).reshape( diff --git a/tools/python/utils/device.py b/tools/python/utils/device.py index 9037d0c882f26d5962df68f44e2cf97194237ea3..53d2535804857be857de0a6ae1ae7dabe350c87d 100644 --- a/tools/python/utils/device.py +++ b/tools/python/utils/device.py @@ -36,10 +36,11 @@ def execute(cmd, verbose=True): universal_newlines=True) if not verbose: - # use p.communicate instead of p.wait to avoid such situation: pipe is filled and the child process is blocked. + # use p.communicate instead of p.wait to avoid such situation: + # pipe is filled and the child process is blocked. out, err = p.communicate() if p.returncode != 0: - raise Exception("errorcode: {}".format(p.returncode) ) + raise Exception("errorcode: {}".format(p.returncode)) return out buf = []