diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fc705a53d1b8c266d1c6104982fda5caf33426a..9714b1456a1ecdc227b5861d114f09f7464cae97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
   enable_language(CUDA)
 endif(MACE_ENABLE_CUDA)
 
-if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
+if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
   if(ANDROID_ABI STREQUAL "arm64-v8a")
     # Use gold linker to avoid linking check of libcdsprpc.so
     set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md
index 3e4616717767f46894f461d25abe599561639a91..2bf0af810845070f77ac174bcbfb7ccfc8f40113 100644
--- a/docs/development/adding_a_new_op.md
+++ b/docs/development/adding_a_new_op.md
@@ -33,8 +33,8 @@ class MyCustomOp : public Operation {
 }
 
 #ifdef MACE_ENABLE_OPENCL
-template <typename T>
-class MyCustomOp<DeviceType::GPU, T> : public Operation {
+template<>
+class MyCustomOp<DeviceType::GPU, float> : public Operation {
 ...
 };
 #endif // MACE_ENABLE_OPENCL
@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::CPU, float);
 
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
-                   DeviceType::GPU, float);
-
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
-                   DeviceType::GPU, half);
-#endif // MACE_ENABLE_OPENCL
+  MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
 }
 
 } // namespace ops
diff --git a/mace/codegen/BUILD.bazel b/mace/codegen/BUILD.bazel
index 0e5bad98a70cfdb6ebe37fc8112c535ef0ca6e8b..a2a750156f7efd9127d71045b20d3fdd72fcf37f 100644
--- a/mace/codegen/BUILD.bazel
+++ b/mace/codegen/BUILD.bazel
@@ -5,7 +5,7 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
+load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
 
 cc_library(
     name = "generated_models",
@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
 cc_library(
     name = "generated_opencl",
     srcs = ["opencl/encrypt_opencl_kernel.cc"],
+    hdrs = ["opencl/encrypt_opencl_kernel.h"],
     copts = [
         "-Werror",
         "-Wextra",
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 605ae3a759b9beae2d930263f20316490c15fd1b..883bc1eb828faaeeda015402d1f9f40059f28d5c 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
   std::string key = OpKeyBuilder(op_type)
       .Device(device_type)
-      .TypeConstraint("T", dtype)
+      .TypeConstraint("T", dtype == DT_HALF ?
DT_FLOAT : dtype) .Build(); if (registry_.at(op_type)->creators.count(key) == 0) { LOG(FATAL) << "Key not registered: " << key; diff --git a/mace/core/operator.h b/mace/core/operator.h index 9430d90d05be00ac2ae1e7034c4ea3f8c5dadfe2..fbcbfd2ead3f8d70552464420f450fae17b04b0a 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -39,7 +39,7 @@ class OpConditionContext { OpConditionContext(const Workspace *ws, TensorShapeMap *info); ~OpConditionContext() = default; - void set_operator_def(const OperatorDef* operator_def); + void set_operator_def(const OperatorDef *operator_def); inline const OperatorDef *operator_def() const { return operator_def_; @@ -49,7 +49,7 @@ class OpConditionContext { return ws_; } - inline void set_device(Device* device) { + inline void set_device(Device *device) { device_ = device; } @@ -110,7 +110,7 @@ class OpConstructContext { return ws_; } - inline void set_device(Device* device) { + inline void set_device(Device *device) { device_ = device; } @@ -166,14 +166,14 @@ class Operation { explicit Operation(OpConstructContext *context); virtual ~Operation() = default; - template + template inline T GetOptionalArg(const std::string &name, const T &default_value) const { MACE_CHECK(operator_def_, "operator_def was null!"); return ProtoArgHelper::GetOptionalArg( *operator_def_, name, default_value); } - template + template inline std::vector GetRepeatedArgs( const std::string &name, const std::vector &default_value = {}) const { MACE_CHECK(operator_def_, "operator_def was null!"); @@ -240,7 +240,6 @@ class Operation { #define MACE_OP_OUTPUT_TAGS(first_input, ...) \ enum _OutputTags { first_input = 0, __VA_ARGS__ } - struct OpRegistrationInfo { public: typedef std::function(OpConstructContext *)> @@ -290,7 +289,6 @@ class OpConditionBuilder { OpRegistrationInfo::DataFormatSelector data_format_selector_; }; - class OpRegistryBase { public: OpRegistryBase() = default; @@ -315,7 +313,7 @@ class OpRegistryBase { OpConstructContext *context, DeviceType device_type) const; - template + template static std::unique_ptr DefaultCreator( OpConstructContext *context) { return std::unique_ptr(new DerivedType(context)); @@ -334,6 +332,24 @@ class OpRegistryBase { DataTypeToEnum
::value, \
                         OpRegistryBase::DefaultCreator<class_name<device, dt>>)
 
+#define MACE_REGISTER_OP_BY_CLASS( \
+    op_registry, op_type, class_name, device, dt) \
+  op_registry->Register(op_type, \
+                        device, \
+                        DataTypeToEnum<dt>::value, \
+                        OpRegistryBase::DefaultCreator<class_name>)
+
+#ifdef MACE_ENABLE_OPENCL
+#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
+  op_registry->Register( \
+      op_type, \
+      DeviceType::GPU, \
+      DT_FLOAT, \
+      OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
+#else
+#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
+#endif
+
 #define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
   op_registry->Register(builder)
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 022010246da6c59b6bf29da2acfe88b98fabf9be..4875cc228c00effeff3d12d676df410103ae16d2 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -18,20 +18,19 @@
 #include
 #include
 #include  // NOLINT(build/c++11)
+#include
 #include
 #include
 #include
 
-#include "mace/utils/macros.h"
+#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
 #include "mace/core/kv_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
+#include "mace/utils/macros.h"
 #include "mace/utils/tuner.h"
 
 namespace mace {
 
-extern const std::map<std::string, std::vector<unsigned char>>
-    kEncryptedProgramMap;
-
 const std::string OpenCLErrorToString(cl_int error) {
   switch (error) {
     case CL_SUCCESS:
@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
     const GPUPriorityHint priority_hint,
     const GPUPerfHint perf_hint,
     std::shared_ptr<KVStorage> precompiled_binary_storage,
-    std::shared_ptr<Tuner<uint32_t>> tuner):
+    std::shared_ptr<Tuner<uint32_t>> tuner) :
     cache_storage_(cache_storage),
     precompiled_binary_storage_(precompiled_binary_storage),
     tuner_(tuner),
@@ -332,7 +331,7 @@ OpenCLRuntime::OpenCLRuntime(
   cl_int err;
   if (gpu_type_ == GPUType::QUALCOMM_ADRENO
-    && opencl_version_ == OpenCLVersion::CL_VER_2_0) {
+      && opencl_version_ == OpenCLVersion::CL_VER_2_0) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties,
@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
     if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
       std::vector<cl_context_properties> context_properties = {
-          CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
-          CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
+          CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
+          CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
           CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
       };
       context_ = std::shared_ptr<cl::Context>(
@@ -399,7 +398,7 @@ OpenCLRuntime::OpenCLRuntime(
   if (cached_binary_platform_info != platform_info_) {
     if (precompiled_binary_storage_ == nullptr) {
       VLOG(1) << "There is no precompiled OpenCL binary in"
-                 " all OpenCL binary paths.";
+              " all OpenCL binary paths.";
     } else {
       if (precompiled_binary_storage_->Load() != 0) {
         LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
" @@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( return true; } +MaceStatus GetProgramSourceByName(const std::string &program_name, + std::string *source) { + MACE_CHECK_NOTNULL(source); + std::stringstream source_stream; + const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap; + const auto &it_program = kEncryptedProgramMap.find(program_name); + if (it_program == kEncryptedProgramMap.end()) { + LOG(ERROR) << "Find program " << program_name << " failed."; + return MaceStatus::MACE_RUNTIME_ERROR; + } + + const std::vector &headers = it_program->second.headers_; + for (const std::string &header : headers) { + const auto &header_program = kEncryptedProgramMap.find(header); + if (header_program == kEncryptedProgramMap.end()) { + LOG(WARNING) << "Program header(" << header << ") is empty."; + continue; + } + + const auto &header_source = header_program->second.encrypted_code_; + source_stream << ObfuscateString( + std::string(header_source.begin(), header_source.end())); + } + + const auto &it_source = it_program->second.encrypted_code_; + source_stream << ObfuscateString( + std::string(it_source.begin(), it_source.end())); + *source = source_stream.str(); + + return MaceStatus::MACE_SUCCESS; +} + bool OpenCLRuntime::BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, cl::Program *program) { - // Find from source - auto it_source = kEncryptedProgramMap.find(program_name); - if (it_source != kEncryptedProgramMap.end()) { + std::string kernel_source; + MaceStatus status = GetProgramSourceByName(program_name, &kernel_source); + if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) { cl::Program::Sources sources; - std::string source(it_source->second.begin(), it_source->second.end()); - std::string kernel_source = ObfuscateString(source); sources.push_back(kernel_source); *program = cl::Program(context(), sources); cl_int ret = program->build({device()}, build_options_str.c_str()); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 6b566d181ff3a7074be3e31ef2eb5ed725bf30d7..500b84eff39c4d9e8cd578a2b90949bc7524d27f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { *net_def, "opencl_mem_type", static_cast(MemoryType::GPU_IMAGE)); const MemoryType mem_type = static_cast(mem_type_i); - runtime->set_mem_type(mem_type); return MaceStatus::MACE_SUCCESS; diff --git a/mace/mace.bzl b/mace/mace.bzl index cef0a5d225a46de6357c9db3bb464fee899040be..47d44edb38e90ebf61f6c1ed9d2dcff23126214d 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -118,9 +118,21 @@ def mace_version_genrule(): ) def encrypt_opencl_kernel_genrule(): - native.genrule( - name = "encrypt_opencl_kernel_gen", - srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))], - outs = ["opencl/encrypt_opencl_kernel.cc"], - cmd = "cat $(SRCS) > $@;" - ) + srcs = [ + str(Label( + "@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc", + )), + str(Label( + "@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h", + )), + ] + outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"] + native.genrule( + name = "encrypt_opencl_kernel_gen", + srcs = srcs, + outs = outs, + cmd = " && ".join([ + "cat $(location %s) > $(location %s)" % (srcs[i], outs[i]) + for i in range(0, len(outs)) + ]), + ) diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 
a80b556dda4d759c8be28cffcb8ed4c1c45fea52..9861198aaa49b99dec5302a0c934f2947e39fc7d 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -181,7 +181,6 @@ cc_library( ], ) - cc_library( name = "internal_ops", srcs = glob( @@ -239,10 +238,10 @@ cc_library( name = "ops", srcs = [ "registry/ops_registry.cc", - ], + ], hdrs = [ "registry/ops_registry.h", - ], + ], copts = [ "-Werror", "-Wextra", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 6cb21b5c525ee0b6529348bcfcddd7acd9cfef7b..255370568b6eb7a8702900b85b0e2c99d4606a6b 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -83,28 +83,27 @@ class ActivationOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ActivationOp : public Operation { +template<> +class ActivationOp : public Operation { public: explicit ActivationOp(OpConstructContext *context) : Operation(context) { ActivationType type = ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP")); - auto relux_max_limit = static_cast( - Operation::GetOptionalArg("max_limit", 0.0f)); - auto leakyrelu_coefficient = static_cast( - Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)); + auto relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + auto leakyrelu_coefficient = + Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f); MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( type, relux_max_limit, leakyrelu_coefficient); } else { MACE_NOT_IMPLEMENTED; } if (type == ActivationType::PRELU) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -126,14 +125,7 @@ class ActivationOp : public Operation { void RegisterActivation(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Activation") @@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 523557cffdec564ba9706c4279dd4f20f0d933a7..5b98ba8554caa69929adacefe27b94499d274cd9 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -29,10 +29,10 @@ namespace mace { namespace ops { -template +template class AddNOp; -template <> +template<> class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) @@ -62,13 +62,13 @@ class AddNOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class AddNOp : public Operation { +template<> +class AddNOp : public Operation { public: 
explicit AddNOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -92,15 +92,9 @@ class AddNOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterAddN(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("AddN") @@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 4e303d07e79b1a5cc9d847720aede92de462f980..a27e46c5739428e6b08952db83f0dfce5b60e798 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -161,8 +161,8 @@ class BatchNormOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class BatchNormOp : public Operation { +template<> +class BatchNormOp : public Operation { public: explicit BatchNormOp(OpConstructContext *context) : Operation(context) { @@ -176,7 +176,7 @@ class BatchNormOp : public Operation { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( epsilon, activation, relux_max_limit, leakyrelu_coefficient); } else { MACE_NOT_IMPLEMENTED; @@ -187,7 +187,7 @@ class BatchNormOp : public Operation { const Tensor *input_tensor = context->workspace()->GetTensor( operator_def_->input(i)); MACE_CHECK(input_tensor != nullptr); - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, @@ -235,14 +235,7 @@ class BatchNormOp : public Operation { void RegisterBatchNorm(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); } } // namespace ops diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 03ac91ffb146d4e54c12d94497fb19bdec23337a..937387fc6be78587c0898a5ab5d00a3640b87d3b 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation { } }; -template +template class BatchToSpaceNDOp; -template <> +template<> class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) @@ -175,7 +175,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { } }; -template <> +template<> class BatchToSpaceNDOp 
: public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) @@ -259,13 +259,13 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { }; #ifdef MACE_ENABLE_OPENCL -template -class BatchToSpaceNDOp : public BatchToSpaceOpBase { +template<> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -285,7 +285,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, float); @@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, uint8_t); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BatchToSpaceND", - BatchToSpaceNDOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BatchToSpaceND", - BatchToSpaceNDOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp); } } // namespace ops diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 72e93fece0850710fd26aefab0cdddcddaedfc3e..f8c5b352d54bd80dd622d8fba8d5a81b8429a88b 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -34,16 +34,16 @@ namespace mace { namespace ops { -template +template class BiasAddOp; -template <> +template<> class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - has_data_format_(Operation::GetOptionalArg("has_data_format", 0)) - {} + has_data_format_(Operation::GetOptionalArg("has_data_format", + 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -96,8 +96,8 @@ class BiasAddOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class BiasAddOp : public Operation { +template<> +class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), @@ -105,11 +105,11 @@ class BiasAddOp : public Operation { MemoryType mem_type = MemoryType::CPU_BUFFER; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -133,18 +133,10 @@ class BiasAddOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterBiasAdd(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("BiasAdd") @@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return 
{DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index d68ebbbec9d8c03ee4045c92cf4258f9326dcca8..a7fababb3e9a2806d4de0eb4b9d91600c4180a30 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -23,10 +23,10 @@ namespace mace { namespace ops { -template +template class ChannelShuffleOp; -template +template class ChannelShuffleOp : public Operation { public: explicit ChannelShuffleOp(OpConstructContext *context) @@ -74,16 +74,15 @@ class ChannelShuffleOp : public Operation { const int groups_; }; - #ifdef MACE_ENABLE_OPENCL -template -class ChannelShuffleOp : public Operation { +template<> +class ChannelShuffleOp : public Operation { public: explicit ChannelShuffleOp(OpConstructContext *context) : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(groups); + kernel_ = make_unique(groups); } else { MACE_NOT_IMPLEMENTED; } @@ -99,18 +98,11 @@ class ChannelShuffleOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterChannelShuffle(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ChannelShuffle", ChannelShuffleOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ChannelShuffle", - ChannelShuffleOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ChannelShuffle", - ChannelShuffleOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp); MACE_REGISTER_OP_CONDITION( op_registry, @@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int groups = ProtoArgHelper::GetOptionalArg( *op, "group", 1); if (op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } index_t channels = op->output_shape(0).dims(3); index_t channels_per_group = channels / groups; if (groups % 4 != 0 || channels_per_group % 4 != 0) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/pad.h b/mace/ops/common/pad_type.h similarity index 87% rename from mace/ops/pad.h rename to mace/ops/common/pad_type.h index e2139e27e0ae319a8ebe4a441eebc5e53187b965..e244b5e6cbd5fcf1354c7e625b83f60abebb3d56 100644 --- a/mace/ops/pad.h +++ b/mace/ops/common/pad_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_PAD_H_ -#define MACE_OPS_PAD_H_ +#ifndef MACE_OPS_COMMON_PAD_TYPE_H_ +#define MACE_OPS_COMMON_PAD_TYPE_H_ namespace mace { namespace ops { @@ -27,4 +27,4 @@ enum PadType { } // namespace ops } // namespace mace -#endif // MACE_OPS_PAD_H_ +#endif // MACE_OPS_COMMON_PAD_TYPE_H_ diff --git a/mace/ops/pooling.h b/mace/ops/common/pooling_type.h similarity index 85% rename from mace/ops/pooling.h rename to mace/ops/common/pooling_type.h index c49b2669975bf856d30c0d2cf6ab7deef01e09e1..c7adccbf4c2dabdea6f10d25b7a8e8ae4f1eecbc 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/common/pooling_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_POOLING_H_ -#define MACE_OPS_POOLING_H_ +#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_ +#define MACE_OPS_COMMON_POOLING_TYPE_H_ namespace mace { @@ -23,4 +23,4 @@ enum PoolingType { }; } // namespace mace -#endif // MACE_OPS_POOLING_H_ +#endif // MACE_OPS_COMMON_POOLING_TYPE_H_ diff --git a/mace/ops/reduce.h b/mace/ops/common/reduce_type.h similarity index 86% rename from mace/ops/reduce.h rename to mace/ops/common/reduce_type.h index 2888bb721ff9fb9f55a28786593da988734f19de..667f6bece40be4bfb4d0594c9920bcdb6a3e0918 100644 --- a/mace/ops/reduce.h +++ b/mace/ops/common/reduce_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_REDUCE_H_ -#define MACE_OPS_REDUCE_H_ +#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_ +#define MACE_OPS_COMMON_REDUCE_TYPE_H_ namespace mace { @@ -28,4 +28,4 @@ enum ReduceType { }; } // namespace mace -#endif // MACE_OPS_REDUCE_H_ +#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_ diff --git a/mace/ops/resize_bicubic.h b/mace/ops/common/utils.h similarity index 85% rename from mace/ops/resize_bicubic.h rename to mace/ops/common/utils.h index 97323b8665c1ada6b3c16e8e95ee52230f0350b8..06648942bb48492d946793401920cc246ae77b1a 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/common/utils.h @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_RESIZE_BICUBIC_H_ -#define MACE_OPS_RESIZE_BICUBIC_H_ +#ifndef MACE_OPS_COMMON_UTILS_H_ +#define MACE_OPS_COMMON_UTILS_H_ #include "mace/core/types.h" namespace mace { namespace ops { -namespace resize_bicubic { +namespace common { +namespace utils { + constexpr int64_t kTableSize = (1u << 10); inline float CalculateResizeScale(index_t in_size, @@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size, ? 
(in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } -} // namespace resize_bicubic +} // namespace utils +} // namespace common } // namespace ops } // namespace mace -#endif // MACE_OPS_RESIZE_BICUBIC_H_ +#endif // MACE_OPS_COMMON_UTILS_H_ diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 518e9cc2b5b9b0d8ff54308e60bc5a3c55e52f42..ccdb0b2db551d2ce26121b38335918ddae306c68 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -46,10 +46,10 @@ class ConcatOpBase : public Operation { int axis_; }; -template +template class ConcatOp; -template +template class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) @@ -194,13 +194,13 @@ class ConcatOp : public ConcatOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ConcatOp : public ConcatOpBase { +template<> +class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -215,7 +215,6 @@ class ConcatOp : public ConcatOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterConcat(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, DeviceType::CPU, float); @@ -228,51 +227,44 @@ void RegisterConcat(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, - DeviceType::GPU, half); - -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Concat") .SetDevicePlacerFunc( - [](OpConditionContext *context) -> std::set { - auto op = context->operator_def(); - if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; - } - auto tensor_shape_info = context->tensor_shape_info(); - if (op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; - } else { - int has_data_format = - ProtoArgHelper::GetOptionalArg( - *op, "has_data_format", 0); - int axis = ProtoArgHelper::GetOptionalArg( - *op, "axis", 3); - if (!has_data_format || axis != 3) { - return { DeviceType::CPU }; + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return {DeviceType::CPU, DeviceType::GPU}; } - bool divisible_four = true; - for (const std::string &input : op->input()) { - if (tensor_shape_info->find(input) - != tensor_shape_info->end()) { - divisible_four = divisible_four - && (tensor_shape_info->at(input)[3] % 4 == 0); + auto tensor_shape_info = context->tensor_shape_info(); + if (op->output_shape(0).dims_size() != 4) { + return {DeviceType::CPU}; + } else { + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + int axis = ProtoArgHelper::GetOptionalArg( + *op, "axis", 3); + if (!has_data_format || axis != 3) { + return {DeviceType::CPU}; + } + bool divisible_four = true; + for (const std::string &input : op->input()) { + if (tensor_shape_info->find(input) + != tensor_shape_info->end()) { + divisible_four = divisible_four + && (tensor_shape_info->at(input)[3] % 4 == 0); + } + } + // Only support not divisible 4 case with 2 inputs. 
+ if (op->input_size() > 2 && !divisible_four) { + return {DeviceType::CPU}; } } - // Only support not divisible 4 case with 2 inputs. - if (op->input_size() > 2 && !divisible_four) { - return { DeviceType::CPU }; - } - } - return { DeviceType::CPU, DeviceType::GPU }; - })); + return {DeviceType::CPU, DeviceType::GPU}; + })); } } // namespace ops diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 1963fc865af60c532754345278a9e0f85d9ebc38..c2666d073c370240e3945f166b4ce18a9d9dc0ff 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -446,8 +446,8 @@ class Conv2dOp : public ConvPool2dOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class Conv2dOp : public ConvPool2dOpBase { +template<> +class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), @@ -461,10 +461,10 @@ class Conv2dOp : public ConvPool2dOpBase { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_ = make_unique>(); + kernel_ = make_unique(); } // Transform filter tensor to target format if ((wino_block_size_ == 2 || wino_block_size_ == 4) && @@ -477,19 +477,19 @@ class Conv2dOp : public ConvPool2dOpBase { strides_.data(), dilations_.data(), &wino_block_size_))) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) == MaceStatus::MACE_SUCCESS); } else { wino_block_size_ = 0; - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); } if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp); } } // namespace ops diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 20146c8d05eb728ae54711af0883da5cf6e38bca..acaa73f1cfe82834af09d098a7cfc2b12fe70880 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class CropOp; -template +template class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) @@ -43,7 +43,6 @@ class CropOp : public Operation { } } - MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); @@ -71,7 +70,7 @@ class CropOp : public Operation { MACE_RETURN_IF_ERROR(output->Resize(output_shape)); T *output_data = output->mutable_data(); - const T * input_data = input0->data(); + const T *input_data = input0->data(); crop_copy(input_data, output_data, input0->shape(), output_shape, offsets.data()); @@ -80,10 +79,10 @@ class CropOp : public Operation { } private: - void crop_copy(const T* input_data, T* output_data, + void crop_copy(const T *input_data, T *output_data, const std::vector &input_shape, const std::vector 
&output_shape, - const int32_t* offsets) { + const int32_t *offsets) { const index_t out_img_size = output_shape[1] * output_shape[2] * output_shape[3]; const index_t out_hw = output_shape[2] * output_shape[3]; @@ -94,9 +93,9 @@ class CropOp : public Operation { for (int b = 0; b < output_shape[0]; ++b) { for (int c = 0; c < output_shape[1]; ++c) { for (int h = 0; h < output_shape[2]; ++h) { - T* out_ptr = + T *out_ptr = output_data + b * out_img_size + c * out_hw + h * output_shape[3]; - const T* in_ptr_bch = + const T *in_ptr_bch = input_data + (b + offsets[0]) * in_img_size + (c + offsets[1]) * in_hw + (h + offsets[2]) * input_shape[3] + offsets[3]; @@ -112,13 +111,13 @@ class CropOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class CropOp : public Operation { +template<> +class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( Operation::GetRepeatedArgs("offset")); } else { MACE_NOT_IMPLEMENTED; @@ -133,18 +132,10 @@ class CropOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterCrop(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Crop", CropOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Crop", CropOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Crop") @@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 2b7623e6d48cf5738bccbbed6c7cf30820342f19..6453544ae92c75efc5560ef5f157dcbbfedb13d5 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -167,30 +167,30 @@ class Deconv2dOp : public Deconv2dOpBase { }; #ifdef MACE_ENABLE_OPENCL -template -class Deconv2dOp : public Deconv2dOpBase { +template<> +class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); if (model_type_ == FrameworkType::CAFFE) { if (operator_def_->input_size() >= 3) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } else { if (operator_def_->input_size() >= 4) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 3, @@ -256,13 +256,8 @@ class Deconv2dOp : public Deconv2dOpBase { void 
RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); - + MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, - DeviceType::GPU, half); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Deconv2D") diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index a57ddecfae2ddbcc78b93d601382c3a2933fafac..ba87830a9038ac2c791787a148b114d0a5c0c8f6 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -24,7 +24,7 @@ namespace mace { namespace ops { -template +template class DepthToSpaceOp : public Operation { public: explicit DepthToSpaceOp(OpConstructContext *context) @@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class DepthToSpaceOp : public Operation { +template<> +class DepthToSpaceOp : public Operation { public: explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(block_size); + kernel_ = make_unique(block_size); } else { MACE_NOT_IMPLEMENTED; } @@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthToSpace", - DepthToSpaceOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "DepthToSpace", - DepthToSpaceOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index d53b67463996e8a27b9d0af62227cbc0c8cdbc1e..06964ee038088d6921b5d9244eac3c14913522ae 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -369,24 +369,24 @@ class DepthwiseConv2dOp #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { +template<> +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_ = make_unique>(); + kernel_ = make_unique(); } Tensor *filter_tensor = context->workspace()->GetTensor( operator_def_->input(1)); if (filter_tensor != nullptr && filter_tensor->is_weight()) { // Transform filter tensor to target format - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, @@ -394,7 +394,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { mem_type) == MaceStatus::MACE_SUCCESS); } if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { DepthwiseConv2dOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", 
- DepthwiseConv2dOp, DeviceType::GPU, float); + MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp); - MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", - DepthwiseConv2dOp, DeviceType::GPU, half); +#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("DepthwiseConv2d") @@ -467,8 +464,8 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { DataFormat op_data_format = static_cast( ProtoArgHelper::GetOptionalArg( - *context->operator_def(), "data_format", - static_cast(DataFormat::NONE))); + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; })); } diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 31b634af11ed9756fbb14eddd91d519a7224d1d6..96f6d575fd2c8663d7c2c860dbbdbd7d0801713d 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp }; #ifdef MACE_ENABLE_OPENCL -template -class DepthwiseDeconv2dOp : public Deconv2dOpBase { +template<> +class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::DW_CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() >= 3) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", - DepthwiseDeconv2dOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", - DepthwiseDeconv2dOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp); } } // namespace ops diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index c31d5e55a881bb89fde61001c938ba785012d8c0..f597f70c9682a372e28e6602f0b38fa065b9edec 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1158,8 +1158,8 @@ class EltwiseOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class EltwiseOp : public Operation { +template<> +class EltwiseOp : public Operation { public: explicit EltwiseOp(OpConstructContext *context) : Operation(context) { @@ -1178,7 +1178,7 @@ class EltwiseOp : public Operation { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( type, coeff, scalar_input, scalar_input_index); } else { MACE_NOT_IMPLEMENTED; @@ -1190,14 +1190,14 @@ class EltwiseOp : public Operation { if (ws->HasTensor(operator_def_->input(i)) && ws->GetTensor(operator_def_->input(i))->is_weight()) { if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } else if 
(ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, @@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp); } } // namespace ops diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 9a371b16566c714cc8c352bc7b6a4b1382a9695e..d863a2843a493d3186021d6621f226fc89689e7b 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -184,27 +184,27 @@ class FullyConnectedOp #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class FullyConnectedOp : public FullyConnectedOpBase { +template<> +class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { MemoryType mem_type = MemoryType::CPU_BUFFER; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } // Transform filter tensor to target format - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::WEIGHT_WIDTH, mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) { FullyConnectedOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "FullyConnected", - FullyConnectedOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "FullyConnected", - FullyConnectedOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp); } } // namespace ops diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc index 892cef297e10f41a30163c369f6a62a10768e454..1c7a037ee2b8c1ec445b8c638958209cde7792f0 100644 --- a/mace/ops/identity.cc +++ b/mace/ops/identity.cc @@ -18,7 +18,6 @@ namespace mace { namespace ops { -template class IdentityOp : public Operation { public: explicit IdentityOp(OpConstructContext *context) @@ -34,15 +33,13 @@ class IdentityOp : public Operation { }; void RegisterIdentity(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::CPU, float); - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::CPU, int32_t); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::CPU, float); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::GPU, half); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::GPU, float); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index 
38f711f57ad824f146a4cd0abf306300b5122735..fb7bfecc90ccb80d2cedaf321d65b207be988892 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -19,7 +19,6 @@ namespace mace { namespace ops { -template class InferConv2dShapeOp : public Operation { public: explicit InferConv2dShapeOp(OpConstructContext *context) @@ -66,20 +65,23 @@ class InferConv2dShapeOp : public Operation { int32_t out_h = 0, out_w = 0; if (!paddings.empty()) { out_h = (in_h - kernels[2] + paddings[0]) / strides[0] + 1; - out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; + out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; } else { switch (padding_type) { - case SAME: + case SAME: { out_h = (in_h + strides[0] - 1) / strides[0]; out_w = (in_w + strides[1] - 1) / strides[1]; break; - case VALID: + } + case VALID: { out_h = (in_h - kernels[2] + 1) / strides[0]; out_w = (in_w - kernels[3] + 1) / strides[1]; break; - default: + } + default: { MACE_NOT_IMPLEMENTED; break; + } } } @@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation { }; void RegisterInferConv2dShape(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::CPU, float); - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::CPU, int32_t); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, float); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::GPU, half); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, float); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 592d25ae724ed8a93191049a31097a4e95c91d2a..f9bfec53cfb0127e123b50b65587dbf34399cd07 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -77,7 +77,7 @@ class MatMulOpBase : public Operation { } else { MACE_CHECK(lhs_rank == 2 || rhs_rank == 2, "Either lhs or rhs matrix should has rank 2 " - "for non-batched matrix multiplication"); + "for non-batched matrix multiplication"); } index_t @@ -492,8 +492,8 @@ class MatMulOp : public MatMulOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class MatMulOp : public MatMulOpBase { +template<> +class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) : MatMulOpBase(context) { @@ -592,7 +592,6 @@ class MatMulOp : public MatMulOpBase { }; #endif // MACE_ENABLE_NEON - void RegisterMatMul(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, DeviceType::CPU, float); @@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp); #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc index 
58ae277ec1a493f0baa5d584089736a4f86aeb38..f3685b985196e8afe956652be2fa85c2f8769b8c 100644 --- a/mace/ops/opencl/buffer/buffer_transform.cc +++ b/mace/ops/opencl/buffer/buffer_transform.cc @@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t out_chan = input->dim(0); const index_t in_chan = input->dim(1); @@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter( MACE_OUT_OF_RANGE_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); built_options.emplace("-Dtransform_conv_filter=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t multiplier = input->dim(0); const index_t in_chan = input->dim(1); @@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter( MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -164,7 +164,6 @@ MaceStatus TransformArgument( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t size = input->dim(0); @@ -181,8 +180,9 @@ MaceStatus TransformArgument( MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); built_options.emplace("-Dtransform_arg=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -229,6 +229,30 @@ MaceStatus TransformArgument( return MaceStatus::MACE_SUCCESS; } +MaceStatus BufferTransform::Compute(OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + MACE_UNUSED(wino_blk_size); + switch (type) { + case CONV2D_FILTER: + return TransformConv2DFilter(context, &kernel_, input, output); + case DW_CONV2D_FILTER: + return TransformDWConv2DFilter(context, &kernel_, input, output); + case ARGUMENT: + return TransformArgument(context, &kernel_, input, output); + default: + if (input->dtype() != output->dtype()) { + return BufferTypeTransform(context, &kernel_, input, output); + } else { + SetFutureDefaultWaitFn(context->future()); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } + } +} + } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/buffer_transform.h 
b/mace/ops/opencl/buffer/buffer_transform.h index 762518047dfefeebe3665dcb8d327e09ccb6b17d..c32ccbb13069ea800aa30b2ac8cd8a2eb6cac2b5 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformDWConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformArgument( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); - -template -class BufferTransform: public OpenCLBufferTransformKernel { +class BufferTransform : public OpenCLBufferTransformKernel { public: MaceStatus Compute( OpContext *context, @@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus BufferTransform::Compute(OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - MACE_UNUSED(wino_blk_size); - const DataType dt = DataTypeToEnum::value; - switch (type) { - case CONV2D_FILTER: - return TransformConv2DFilter(context, &kernel_, input, dt, output); - case DW_CONV2D_FILTER: - return TransformDWConv2DFilter(context, &kernel_, input, dt, output); - case ARGUMENT: - return TransformArgument(context, &kernel_, input, dt, output); - default: - if (input->dtype() != dt) { - return BufferTypeTransform(context, &kernel_, input, dt, output); - } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); - return MaceStatus::MACE_SUCCESS; - } - } -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index 6899ba4053c7433c4340af91f9708387d2f02844..2cb3ae0043df20ddfa25421572db5377f0c12363 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); @@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform( std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); built_options.emplace("-Dtransform_data_type=" + kernel_name); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype())); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, diff --git a/mace/ops/opencl/buffer/conv_2d.cc b/mace/ops/opencl/buffer/conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..50109b6e2341f488ff39de17360d448dd238dc72 --- /dev/null +++ b/mace/ops/opencl/buffer/conv_2d.cc @@ -0,0 +1,170 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/conv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) { + MACE_UNUSED(kwg_size_); + MACE_UNUSED(runtime); + MACE_UNUSED(output_shape); + MACE_UNUSED(wino_block_size); + return (filter_shape[2] == 3 && filter_shape[3] == 3 && + strides[0] == 1 && strides[1] == 1 && + dilations[0] == 1 && dilations[1] == 1); +} + +MaceStatus Conv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const int winograd_blk_size, + Tensor *output) { + MACE_UNUSED(winograd_blk_size); + StatsFuture pad_future, conv_future; + index_t filter_h = filter->dim(2); + index_t filter_w = filter->dim(3); + // Reshape output + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + std::function conv_func; + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + bool use_1x1 = filter_h == 1 && filter_w == 1; + + std::vector padded_output_shape = output_shape; + index_t tile_w, tile_c = 4; + if (use_1x1) { + tile_w = 2; + } else { + tile_w = 4; + } + padded_output_shape[2] = RoundUp(width, tile_w); + + std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || + padded_input_shape[3] != input_channels) { + // decide scratch size before allocate it + index_t total_scratch_size = 0; + index_t 
padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + if (use_1x1) { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2d1x1( + context, &kernels_[1], pad_input, filter, bias, strides, + activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &conv_future); + }; + } else { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2dGeneral( + context, &kernels_[1], pad_input, filter, bias, strides, dilations, + activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &conv_future); + }; + } + MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); + MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index 4ef8d79d9304143d29ba35125ad0b0970af310cb..c50752c3bc6abeaaabc961084d72e8f7afba9f76 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context, const Tensor *filter, const Tensor *bias, const int *strides, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, StatsFuture *future); } // namespace conv2d -template class Conv2dKernel : public OpenCLConv2dKernel { public: Conv2dKernel() : old_scratch_size_(0) {} @@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; - -template -bool Conv2dKernel::CheckUseWinograd( - OpenCLRuntime *runtime, - const std::vector &filter_shape, - const std::vector &output_shape, - const int *strides, - const int *dilations, - int *wino_block_size) { - MACE_UNUSED(runtime); - MACE_UNUSED(output_shape); - MACE_UNUSED(wino_block_size); - return (filter_shape[2] == 3 && filter_shape[3] == 3 && - strides[0] == 1 && strides[1] == 1 && - dilations[0] == 1 && dilations[1] == 1); -} - -template -MaceStatus Conv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - 
const int winograd_blk_size, - Tensor *output) { - MACE_UNUSED(winograd_blk_size); - StatsFuture pad_future, conv_future; - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - // Reshape output - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter->shape().data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // calculate padded input shape - index_t width = output_shape[2]; - index_t channels = output_shape[3]; - - index_t input_height = input->dim(1); - index_t input_width = input->dim(2); - index_t input_channels = input->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_left = paddings[1] >> 1; - - MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - std::function conv_func; - - // Mark whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - bool use_1x1 = filter_h == 1 && filter_w == 1; - - std::vector padded_output_shape = output_shape; - index_t tile_w, tile_c = 4; - if (use_1x1) { - tile_w = 2; - } else { - tile_w = 4; - } - padded_output_shape[2] = RoundUp(width, tile_w); - - std::vector padded_input_shape = input->shape(); - padded_input_shape[1] = input_height + paddings[0]; - padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + - (filter_w - 1) * dilations[1] + 1; - padded_input_shape[3] = RoundUp(input_channels, tile_c); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[1] != input_height || - padded_input_shape[2] != input_width || - padded_input_shape[3] != input_channels) { - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, pad_top, pad_left, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - if (use_1x1) { - conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { - return conv2d::Conv2d1x1( - context, &kernels_[1], pad_input, filter, bias, strides, - DataTypeToEnum::v(), activation, relux_max_limit, - leakyrelu_coefficient, input_changed, output, &conv_future); - }; - } else { - conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { - return conv2d::Conv2dGeneral( - context, &kernels_[1], pad_input, filter, bias, strides, dilations, - DataTypeToEnum::v(), activation, 
relux_max_limit, - leakyrelu_coefficient, input_changed, output, &conv_future); - }; - } - MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); - MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); - - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc index bfe6775e91b0bf673365e2db4b634a57e10029bc..6eeb0f1d1584eb4eb14fd749602895437286e766 100644 --- a/mace/ops/opencl/buffer/conv_2d_1x1.cc +++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc @@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context, const Tensor *filter, const Tensor *bias, const int *strides, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); built_options.emplace("-Dconv2d=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + std::string data_dt = DtToCLDt(padded_input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc index f2090a1bb6d5d69b89a14bedb9118470c59c8c01..b19b702083bbdeb2f94b2d6ab8e7e13a02c3ab12 100644 --- a/mace/ops/opencl/buffer/conv_2d_general.cc +++ b/mace/ops/opencl/buffer/conv_2d_general.cc @@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); built_options.emplace("-Dconv2d=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + std::string pad_data_dt = DtToCLDt(padded_input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt); + std::string out_data_dt = DtToCLDt(output->dtype()); + built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.cc b/mace/ops/opencl/buffer/depthwise_conv2d.cc index d9e1c2c054ee3280b3515a39f480c72ce9f96c43..48c9829f4cd3ad04daf95b5d1964807b9e0a0e67 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.cc +++ b/mace/ops/opencl/buffer/depthwise_conv2d.cc @@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context, } } // namespace depthwise + +MaceStatus DepthwiseConv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + StatsFuture pad_future, dw_conv_future; + index_t filter_w = filter->dim(3); + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported"); + MACE_CHECK(filter->dim(0) * input_channels == channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + std::vector padded_output_shape = output_shape; + index_t tile_w = 4, tile_c = 4; + padded_output_shape[2] = RoundUp(width, tile_w); + + std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * 
dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || + padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + MACE_RETURN_IF_ERROR( + depthwise::DepthwiseConv2d( + context, &kernels_[1], padded_input_ptr, filter, bias, strides, + dilations, activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &dw_conv_future)); + MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); + return MaceStatus::MACE_SUCCESS; +} + } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h index 6a46334a787378441d84d020cf578042e6bd24b9..98dffa12734b8404221869d147420a2e76866224 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.h +++ b/mace/ops/opencl/buffer/depthwise_conv2d.h @@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, StatsFuture *future); } // namespace depthwise - -template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: DepthwiseConv2dKernel() : old_scratch_size_(0) {} @@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { private: index_t old_scratch_size_; cl::Kernel kernels_[2]; - uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus DepthwiseConv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - StatsFuture pad_future, dw_conv_future; - index_t filter_w = filter->dim(3); - - // Create a fake conv_2d filter to calculate the paddings and output size - std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->dim(0) * filter->dim(1); - fake_filter_shape[1] = filter->dim(1); - fake_filter_shape[2] = filter->dim(2); - fake_filter_shape[3] = filter->dim(3); - - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations, strides, - padding_type, 
output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), fake_filter_shape.data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // calculate padded input shape - index_t width = output_shape[2]; - index_t channels = output_shape[3]; - - index_t input_height = input->dim(1); - index_t input_width = input->dim(2); - index_t input_channels = input->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_left = paddings[1] >> 1; - - MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported"); - MACE_CHECK(filter->dim(0) * input_channels == channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - // Mark whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - std::vector padded_output_shape = output_shape; - index_t tile_w = 4, tile_c = 4; - padded_output_shape[2] = RoundUp(width, tile_w); - - std::vector padded_input_shape = input->shape(); - padded_input_shape[1] = input_height + paddings[0]; - padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + - (filter_w - 1) * dilations[1] + 1; - padded_input_shape[3] = RoundUp(input_channels, tile_c); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[1] != input_height || - padded_input_shape[2] != input_width || - padded_input_shape[3] != input_channels) { - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, pad_top, pad_left, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - MACE_RETURN_IF_ERROR( - depthwise::DepthwiseConv2d( - context, &kernels_[1], padded_input_ptr, filter, bias, strides, - dilations, DataTypeToEnum::v(), activation, relux_max_limit, - leakyrelu_coefficient, input_changed, output, &dw_conv_future)); - MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/pooling.cc b/mace/ops/opencl/buffer/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..e19d1ab04ebd3faea1067e6c0d4ec548c61a0cc5 --- /dev/null +++ b/mace/ops/opencl/buffer/pooling.cc @@ -0,0 +1,174 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
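The new Conv2dKernel::Compute and DepthwiseConv2dKernel::Compute above, like the PoolingKernel that follows, size their padded-input scratch space the same way: the product of the padded NHWC dims, times the element size, plus MACE's safety pad. A standalone sketch of that arithmetic, assuming plain int64_t in place of index_t and an explicit pad argument in place of MACE_EXTRA_BUFFER_PAD_SIZE:

  // Illustrative sketch, not part of the patch.
  #include <cstdint>
  #include <functional>
  #include <numeric>
  #include <vector>

  int64_t PaddedInputScratchBytes(const std::vector<int64_t> &padded_shape,
                                  int64_t bytes_per_element,
                                  int64_t extra_pad_bytes) {
    // Element count is the product of all padded dimensions.
    const int64_t elements =
        std::accumulate(padded_shape.begin(), padded_shape.end(), int64_t{1},
                        std::multiplies<int64_t>());
    // The kernels grow the scratch buffer by this amount and re-set their
    // kernel arguments whenever the resulting size changes (input_changed).
    return elements * bytes_per_element + extra_pad_bytes;
  }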
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/pooling.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +MaceStatus PoolingKernel::Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const RoundType round_type, + Tensor *output) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + StatsFuture pad_future, pooling_future; + + index_t input_channels = input->dim(3); + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, round_type, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + + // pad input + std::vector padded_input_shape = input->shape(); + padded_input_shape[3] = RoundUp(input_channels, 4); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, 0, 0, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + cl::Kernel *kernel = &kernels_[1]; + MACE_OUT_OF_RANGE_DEFINITION + + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + auto input_dtype = input->dtype(); + auto input_dt = DtToCLDt(input_dtype); + built_options.emplace("-DIN_DATA_TYPE=" + input_dt); + auto output_dtype = output->dtype(); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype)); + if (pooling_type == MAX && 
input_dtype == output_dtype) { + built_options.emplace("-DDATA_TYPE=" + input_dt); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer", + kernel_name, + built_options, + kernel)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer())); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(1))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(2))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(3))); + kernel->setArg(idx++, static_cast(output->dim(1))); + kernel->setArg(idx++, static_cast(output->dim(3))); + kernel->setArg(idx++, paddings[0] / 2); + kernel->setArg(idx++, paddings[1] / 2); + kernel->setArg(idx++, strides[0]); + kernel->setArg(idx++, strides[1]); + kernel->setArg(idx++, kernels[0]); + kernel->setArg(idx++, kernels[1]); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + const std::vector lws = {4, 4, 4, 0}; + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, &pooling_future)); + MACE_OUT_OF_RANGE_VALIDATION + MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h index 4f153e4acfff75ab179e567803e05e14f67ceebf..9e675e29aa14bd12409f0a1315fe34c023a73b5d 100644 --- a/mace/ops/opencl/buffer/pooling.h +++ b/mace/ops/opencl/buffer/pooling.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace buffer { -template class PoolingKernel : public OpenCLPoolingKernel { public: PoolingKernel() : old_scratch_size_(0) {} @@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel { std::vector input_shape_; }; -template -MaceStatus PoolingKernel::Compute( - OpContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const RoundType round_type, - Tensor *output) { - MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) - << "Pooling opencl kernel not support dilation yet"; - - StatsFuture pad_future, pooling_future; - - index_t input_channels = input->dim(3); - - std::vector output_shape(4); - std::vector filter_shape = {input->dim(3), input->dim(3), - kernels[0], kernels[1]}; - - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter_shape.data(), - padding_data.data(), dilations, strides, round_type, - output_shape.data()); - } - - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // Mark 
whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - - // pad input - std::vector padded_input_shape = input->shape(); - padded_input_shape[3] = RoundUp(input_channels, 4); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[3] != input_channels) { - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, 0, 0, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - cl::Kernel *kernel = &kernels_[1]; - MACE_OUT_OF_RANGE_DEFINITION - - if (kernel->get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - - if (pooling_type == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DIN_DATA_TYPE=" + - DtToCLDt(input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - } else { - built_options.emplace("-DIN_DATA_TYPE=" + - DtToCLDt(input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - } - if (pooling_type == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer", - kernel_name, - built_options, - kernel)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - } - - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output->dim(3))), - static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)), - }; - - MACE_OUT_OF_RANGE_INIT(*kernel); - if (input_changed) { - uint32_t idx = 0; - MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); - MACE_SET_3D_GWS_ARGS(*kernel, gws); - kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer())); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(1))); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(2))); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(3))); - kernel->setArg(idx++, static_cast(output->dim(1))); - kernel->setArg(idx++, static_cast(output->dim(3))); - kernel->setArg(idx++, paddings[0] / 2); - kernel->setArg(idx++, paddings[1] / 2); - kernel->setArg(idx++, strides[0]); - kernel->setArg(idx++, strides[1]); - kernel->setArg(idx++, kernels[0]); - kernel->setArg(idx++, kernels[1]); - kernel->setArg(idx++, *(output->opencl_buffer())); - } - - const std::vector lws = {4, 4, 4, 0}; - std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), - 
output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, &pooling_future)); - MACE_OUT_OF_RANGE_VALIDATION - MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); - - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/softmax.cc b/mace/ops/opencl/buffer/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc70ea93a07f35d3daa44617f983a954392b8485 --- /dev/null +++ b/mace/ops/opencl/buffer/softmax.cc @@ -0,0 +1,99 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/softmax.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +MaceStatus SoftmaxKernel::Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) { + index_t batch = 0; + index_t height = 0; + index_t width = 0; + index_t channels = 0; + + if (logits->dim_size() == 2) { + batch = logits->dim(0); + height = 1; + width = 1; + channels = logits->dim(1); + + } else if (logits->dim_size() == 4) { + batch = logits->dim(0); + height = logits->dim(1); + width = logits->dim(2); + channels = logits->dim(3); + } else { + MACE_NOT_IMPLEMENTED; + } + + const index_t channel_blocks = RoundUpDiv4(channels); + const int remain_channels = channel_blocks * 4 - channels; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + if (use_log_) built_options.emplace("-DUSE_LOG"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, logits->shape())) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size()); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(logits->opencl_buffer())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(output->opencl_buffer())); + + input_shape_ = logits->shape(); + } + + std::vector lws = {4, 4, 4, 0}; + std::string tuning_key = + Concat("softmax_opencl_kernel", batch, height, width, channels); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, 
tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h index 3ab6a7cef1bd1d760ea70e1409f687d664f51996..05d27cac7f4bdcd408c6b25b958e6414bde8249a 100644 --- a/mace/ops/opencl/buffer/softmax.h +++ b/mace/ops/opencl/buffer/softmax.h @@ -29,7 +29,7 @@ namespace mace { namespace ops { namespace opencl { namespace buffer { -template + class SoftmaxKernel : public OpenCLSoftmaxKernel { public: explicit SoftmaxKernel(bool use_log) @@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { std::vector input_shape_; }; -template -MaceStatus SoftmaxKernel::Compute( - OpContext *context, - const Tensor *logits, - Tensor *output) { - index_t batch = 0; - index_t height = 0; - index_t width = 0; - index_t channels = 0; - - if (logits->dim_size() == 2) { - batch = logits->dim(0); - height = 1; - width = 1; - channels = logits->dim(1); - - } else if (logits->dim_size() == 4) { - batch = logits->dim(0); - height = logits->dim(1); - width = logits->dim(2); - channels = logits->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - - const index_t channel_blocks = RoundUpDiv4(channels); - const int remain_channels = channel_blocks * 4 - channels; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - if (use_log_) built_options.emplace("-DUSE_LOG"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, logits->shape())) { - uint32_t idx = 0; - MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size()); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(logits->opencl_buffer())); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, remain_channels); - kernel_.setArg(idx++, *(output->opencl_buffer())); - - input_shape_ = logits->shape(); - } - - std::vector lws = {4, 4, 4, 0}; - std::string tuning_key = - Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer_transform.cc b/mace/ops/opencl/buffer_transform.cc index 7e59b339642b571b7bc08f09af1b07814096eaf0..fc1d9dcc2c514d289baa3f56bced871723e778fc 100644 --- a/mace/ops/opencl/buffer_transform.cc +++ b/mace/ops/opencl/buffer_transform.cc @@ -20,11 +20,11 @@ namespace mace { namespace ops { -template +template class BufferTransformOp; -template -class 
BufferTransformOp : public Operation { +template<> +class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), @@ -42,7 +42,7 @@ class BufferTransformOp : public Operation { MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); - return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( + return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( context, input, type, out_mem_type_, wino_blk_size_, output); } @@ -51,13 +51,8 @@ class BufferTransformOp : public Operation { MemoryType out_mem_type_; }; - void RegisterBufferTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "BufferTransform", - BufferTransformOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BufferTransform", - BufferTransformOp, DeviceType::GPU, half); + MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp); } } // namespace ops diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc index cda7c1331c918d8a685dc1a07fa11865afce8602..dae8385644bc8e74c8c4059b75c110600588ba91 100644 --- a/mace/ops/opencl/buffer_transformer.cc +++ b/mace/ops/opencl/buffer_transformer.cc @@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) { return name + postfix; } +MaceStatus TransformFilter( + mace::OpConstructContext *context, + OperatorDef *op_def, + const int input_idx, + const OpenCLBufferType buffer_type, + const MemoryType mem_type, + const int wino_blk_size) { + OpContext op_context(context->workspace(), context->device()); + Workspace *ws = context->workspace(); + std::string input_name = op_def->input(input_idx); + Tensor *input = ws->GetTensor(input_name); + const DataType dt = input->dtype(); + std::string output_name = TransformedFilterName(input_name); + Tensor *output = + ws->CreateTensor(output_name, context->device()->allocator(), dt, true); + + // update the information + op_def->set_input(input_idx, output_name); + input->MarkUnused(); + return OpenCLBufferTransformer(input->memory_type(), mem_type). 
+ Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, + output); +} + } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index d2ef505825eceee5dfb43629ddc250636f952540..f3df8bc4452766b8a15d579f55aae09722c9a48e 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -28,17 +28,16 @@ namespace mace { namespace ops { // Only used for GPU Operation(BufferTransform) -template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else if (in_mem_type == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { - kernel_ = make_unique>(); + kernel_ = make_unique(); } } @@ -49,7 +48,7 @@ class OpenCLBufferTransformer { const int wino_blk_size, Tensor *output) { Workspace *ws = context->workspace(); - DataType dt = DataTypeToEnum::value; + DataType dt = output->dtype(); MemoryType in_mem_type = input->memory_type(); if (out_mem_type == MemoryType::GPU_IMAGE || out_mem_type == MemoryType::GPU_BUFFER) { @@ -87,10 +86,10 @@ class OpenCLBufferTransformer { << " to CPU Buffer " << output->name() << " with data type " << dt; Tensor::MappingGuard guard(&internal_tensor); - const T *internal_ptr = internal_tensor.data(); + const float *internal_ptr = internal_tensor.data(); output->Resize(internal_tensor.shape()); - T *output_ptr = output->mutable_data(); - memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + float *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float)); return MaceStatus::MACE_SUCCESS; } else { LOG(FATAL) << "Unexpected error: " << out_mem_type; @@ -110,30 +109,13 @@ class OpenCLBufferTransformer { std::string TransformedFilterName(const std::string &name); -template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, const int input_idx, const OpenCLBufferType buffer_type, const MemoryType mem_type, - const int wino_blk_size = 0) { - const DataType dt = DataTypeToEnum::value; - OpContext op_context(context->workspace(), context->device()); - Workspace *ws = context->workspace(); - std::string input_name = op_def->input(input_idx); - Tensor *input = ws->GetTensor(input_name); - std::string output_name = TransformedFilterName(input_name); - Tensor *output = - ws->CreateTensor(output_name, context->device()->allocator(), dt, true); - - // update the information - op_def->set_input(input_idx, output_name); - input->MarkUnused(); - return OpenCLBufferTransformer(input->memory_type(), mem_type). 
- Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, - output); -} + const int wino_blk_size = 0); } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/conv_2d.h b/mace/ops/opencl/conv_2d.h index a9ec131d18ef898cb493f4f7ba0bc73fcacc7f07..d6dd40bd6d05c5e5d96af649190c6b9a1ef60822 100644 --- a/mace/ops/opencl/conv_2d.h +++ b/mace/ops/opencl/conv_2d.h @@ -17,8 +17,9 @@ #include -#include "mace/ops/activation.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/deconv_2d.h b/mace/ops/opencl/deconv_2d.h index 282a6dd888ff3ed0bb442067846b836fbad7291a..3335bebf967ba0321d30cce0ff0b249fcffcacd0 100644 --- a/mace/ops/opencl/deconv_2d.h +++ b/mace/ops/opencl/deconv_2d.h @@ -17,7 +17,10 @@ #include -#include "mace/ops/activation.h" +#include "mace/core/types.h" +#include "mace/ops/common/activation_type.h" +#include "mace/public/mace.h" +#include "mace/utils/macros.h" namespace mace { diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h index b2460fcda74e67ff33c9e3dee10ba53dc840fff4..462010729589fcee949f6d64c2387de55f0e44a8 100644 --- a/mace/ops/opencl/depthwise_deconv2d.h +++ b/mace/ops/opencl/depthwise_deconv2d.h @@ -19,6 +19,9 @@ #include #include "mace/ops/common/activation_type.h" +#include "mace/public/mace.h" +#include "mace/utils/macros.h" +#include "mace/core/types.h" namespace mace { diff --git a/mace/ops/opencl/fully_connected.h b/mace/ops/opencl/fully_connected.h index 416aed6c8692ceaf45da1d1eb36f82b3753c8729..88c1cbaba293fcb42c059b46f5e62e0bcd9de70c 100644 --- a/mace/ops/opencl/fully_connected.h +++ b/mace/ops/opencl/fully_connected.h @@ -15,8 +15,7 @@ #ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #define MACE_OPS_OPENCL_FULLY_CONNECTED_H_ -#include "mace/ops/activation.h" - +#include "mace/ops/common/activation_type.h" #include "mace/public/mace.h" #include "mace/utils/math.h" diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 912a8d8d87e549290cf5d174187d288c2462fcb1..9729555a5ce246a1cb4277c61bf5d5de9f16bbd1 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) { } } -std::string DtToUpCompatibleCLDt(const DataType dt) { - switch (dt) { - case DT_FLOAT: - case DT_HALF: - return "float"; - default: - LOG(FATAL) << "Unsupported data type"; - return ""; - } -} - -std::string DtToUpCompatibleCLCMDDt(const DataType dt) { - switch (dt) { - case DT_FLOAT: - case DT_HALF: - return "f"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; - return ""; - } -} - std::vector Default3DLocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size) { diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index a4a49b4e15a021f1fa55fbd39c514777f03005bd..a9e9866c31e85bd82efb1d1b2622d429f8639c5a 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -100,17 +100,9 @@ std::vector FormatBufferShape( // CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); -// CPU data type to upward compatible OpenCL command data type -// e.g. half -> float -std::string DtToUpCompatibleCLCMDDt(const DataType dt); - // CPU data type to OpenCL data type std::string DtToCLDt(const DataType dt); -// CPU data type to upward compatible OpenCL data type -// e.g. 
half -> float -std::string DtToUpCompatibleCLDt(const DataType dt); - // CPU data type to OpenCL condition data type used in select // e.g. half -> float std::string DtToCLCondDt(const DataType dt); diff --git a/mace/ops/opencl/image/activation.cc b/mace/ops/opencl/image/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c8ed331820cb23801fb346d645ed0f7a138936d --- /dev/null +++ b/mace/ops/opencl/image/activation.cc @@ -0,0 +1,123 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/activation.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ActivationKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); + built_options.emplace("-Dactivation=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + switch (activation_) { + case RELU: { + tuning_key_prefix_ = "relu_opencl_kernel"; + built_options.emplace("-DUSE_RELU"); + break; + } + case RELUX: { + tuning_key_prefix_ = "relux_opencl_kernel"; + built_options.emplace("-DUSE_RELUX"); + break; + } + case PRELU: { + tuning_key_prefix_ = "prelu_opencl_kernel"; + built_options.emplace("-DUSE_PRELU"); + break; + } + case TANH: { + tuning_key_prefix_ = "tanh_opencl_kernel"; + built_options.emplace("-DUSE_TANH"); + break; + } + case SIGMOID: { + tuning_key_prefix_ = "sigmoid_opencl_kernel"; + built_options.emplace("-DUSE_SIGMOID"); + break; + } + case LEAKYRELU: { + tuning_key_prefix_ = "leakyrelu_opencl_kernel"; + built_options.emplace("-DUSE_LEAKYRELU"); + break; + } + default: { + LOG(FATAL) << "Unknown activation type: " << activation_; + } + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + if (activation_ == PRELU) { + MACE_CHECK_NOTNULL(alpha); + kernel_.setArg(idx++, *(alpha->opencl_image())); + } + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, 
leakyrelu_coefficient_); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h index 6f7c573cec0c3016ac247e095d6148da158e3301..e98b5e9daefe0cf988b6cb39ee7e0cf4903ea89b 100644 --- a/mace/ops/opencl/image/activation.h +++ b/mace/ops/opencl/image/activation.h @@ -31,12 +31,11 @@ namespace ops { namespace opencl { namespace image { -template class ActivationKernel : public OpenCLActivationKernel { public: ActivationKernel(ActivationType type, - T relux_max_limit, - T leakyrelu_coefficient) + float relux_max_limit, + float leakyrelu_coefficient) : activation_(type), relux_max_limit_(relux_max_limit), leakyrelu_coefficient_(leakyrelu_coefficient) {} @@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel { private: ActivationType activation_; - T relux_max_limit_; - T leakyrelu_coefficient_; + float relux_max_limit_; + float leakyrelu_coefficient_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; std::string tuning_key_prefix_; }; -template -MaceStatus ActivationKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *alpha, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); - built_options.emplace("-Dactivation=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - switch (activation_) { - case RELU: - tuning_key_prefix_ = "relu_opencl_kernel"; - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - tuning_key_prefix_ = "relux_opencl_kernel"; - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - tuning_key_prefix_ = "prelu_opencl_kernel"; - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - tuning_key_prefix_ = "tanh_opencl_kernel"; - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - tuning_key_prefix_ = "sigmoid_opencl_kernel"; - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - tuning_key_prefix_ = "leakyrelu_opencl_kernel"; - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if 
(!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - if (activation_ == PRELU) { - MACE_CHECK_NOTNULL(alpha); - kernel_.setArg(idx++, *(alpha->opencl_image())); - } - kernel_.setArg(idx++, static_cast(relux_max_limit_)); - kernel_.setArg(idx++, static_cast(leakyrelu_coefficient_)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/addn.cc b/mace/ops/opencl/image/addn.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bb38e01b7188e406a0fb13e48d4116c5253a69d --- /dev/null +++ b/mace/ops/opencl/image/addn.cc @@ -0,0 +1,106 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
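+
+// AddN on the GPU image runtime: element-wise sum of 2 to 4 input tensors of
+// identical NHWC shape. The operand count is baked into the OpenCL program via
+// -DINPUT_NUM, and the kernel is now built with DT_FLOAT macros since the op
+// is no longer templated on T.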
+ +#include "mace/ops/opencl/image/addn.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus AddNKernel::Compute( + OpContext *context, + const std::vector &input_tensors, + Tensor *output_tensor) { + size_t size = input_tensors.size(); + MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); + + const index_t batch = input_tensors[0]->dim(0); + const index_t height = input_tensors[0]->dim(1); + const index_t width = input_tensors[0]->dim(2); + const index_t channels = input_tensors[0]->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + for (size_t i = 1; i < size; ++i) { + MACE_CHECK_NOTNULL(input_tensors[i]); + MACE_CHECK(batch == input_tensors[i]->dim(0)); + MACE_CHECK(height == input_tensors[i]->dim(1)); + MACE_CHECK(width == input_tensors[i]->dim(2)); + MACE_CHECK(channels == input_tensors[i]->dim(3)); + } + + if (kernel_.get() == nullptr) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); + built_options.emplace("-Daddn=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + std::vector output_shape = input_tensors[0]->shape(); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + output_tensor->ResizeImage(output_shape, output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + for (auto input : input_tensors) { + kernel_.setArg(idx++, *(input->opencl_image())); + } + kernel_.setArg(idx++, *(output_tensor->opencl_image())); + + input_shape_ = input_tensors[0]->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), + output_tensor->dim(2), output_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index 088dd322d0619205615292cbe0ca355444633b92..b163152bf15838c385b38690c75f8f92499b5ae2 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class AddNKernel : public OpenCLAddNKernel { public: MaceStatus Compute( @@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel { std::vector input_shape_; }; -template -MaceStatus AddNKernel::Compute( - OpContext 
*context, - const std::vector &input_tensors, - Tensor *output_tensor) { - size_t size = input_tensors.size(); - MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); - - const index_t batch = input_tensors[0]->dim(0); - const index_t height = input_tensors[0]->dim(1); - const index_t width = input_tensors[0]->dim(2); - const index_t channels = input_tensors[0]->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - for (size_t i = 1; i < size; ++i) { - MACE_CHECK_NOTNULL(input_tensors[i]); - MACE_CHECK(batch == input_tensors[i]->dim(0)); - MACE_CHECK(height == input_tensors[i]->dim(1)); - MACE_CHECK(width == input_tensors[i]->dim(2)); - MACE_CHECK(channels == input_tensors[i]->dim(3)); - } - - if (kernel_.get() == nullptr) { - if (input_tensors.size() > 4) { - MACE_NOT_IMPLEMENTED; - } - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); - built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - std::vector output_shape = input_tensors[0]->shape(); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - output_tensor->ResizeImage(output_shape, output_image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - for (auto input : input_tensors) { - kernel_.setArg(idx++, *(input->opencl_image())); - } - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - - input_shape_ = input_tensors[0]->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), - output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/batch_norm.cc b/mace/ops/opencl/image/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfb496e77904f274d92a1846d25eeb14c12cc4aa --- /dev/null +++ b/mace/ops/opencl/image/batch_norm.cc @@ -0,0 +1,120 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/batch_norm.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +BatchNormKernel::BatchNormKernel(const float epsilon, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient) + : epsilon_(epsilon), + activation_(activation), + relux_max_limit_(relux_max_limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus BatchNormKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + Tensor *output) { + bool not_folded = (mean != nullptr && var != nullptr); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); + built_options.emplace("-Dbatch_norm=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (!not_folded) { + built_options.emplace("-DFOLDED_CONSTANT"); + } + switch (activation_) { + case NOOP:break; + case RELU:built_options.emplace("-DUSE_RELU"); + break; + case RELUX:built_options.emplace("-DUSE_RELUX"); + break; + case TANH:built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID:built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU"); + break; + default:LOG(FATAL) << "Unknown activation type: " << activation_; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(scale->opencl_image())); + kernel_.setArg(idx++, *(offset->opencl_image())); + if (not_folded) { + kernel_.setArg(idx++, *(mean->opencl_image())); + kernel_.setArg(idx++, *(var->opencl_image())); + kernel_.setArg(idx++, epsilon_); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, leakyrelu_coefficient_); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_norm_opencl_kernel", activation_, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + 
gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h index 73560343b41e907b90f5c5e81379361ac93a589c..b2201a96631fef8ddd3b1a1748550aa96897e646 100644 --- a/mace/ops/opencl/image/batch_norm.h +++ b/mace/ops/opencl/image/batch_norm.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/activation.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class BatchNormKernel : public OpenCLBatchNormKernel { public: BatchNormKernel( @@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel { std::vector input_shape_; }; -template -BatchNormKernel::BatchNormKernel(const float epsilon, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient) - : epsilon_(epsilon), - activation_(activation), - relux_max_limit_(relux_max_limit), - leakyrelu_coefficient_(leakyrelu_coefficient) {} - -template -MaceStatus BatchNormKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - Tensor *output) { - bool not_folded = (mean != nullptr && var != nullptr); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); - built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (!not_folded) { - built_options.emplace("-DFOLDED_CONSTANT"); - } - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(scale->opencl_image())); - kernel_.setArg(idx++, *(offset->opencl_image())); - if (not_folded) { - kernel_.setArg(idx++, *(mean->opencl_image())); - kernel_.setArg(idx++, *(var->opencl_image())); - kernel_.setArg(idx++, epsilon_); - } - 
kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit_); - kernel_.setArg(idx++, leakyrelu_coefficient_); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_norm_opencl_kernel", activation_, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/batch_to_space.cc b/mace/ops/opencl/image/batch_to_space.cc new file mode 100644 index 0000000000000000000000000000000000000000..87f5f5a61d12dbc74897f3409ed1ea49bee610a2 --- /dev/null +++ b/mace/ops/opencl/image/batch_to_space.cc @@ -0,0 +1,100 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/batch_to_space.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BatchToSpaceKernel::Compute( + OpContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + space_tensor->ResizeImage(output_shape, output_image_shape)); + + const uint32_t chan_blk = + static_cast(RoundUpDiv4(batch_tensor->dim(3))); + + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const char *kernel_name = "batch_to_space"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto dt = batch_tensor->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, batch_tensor->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape[0]); + 
kernel_.setArg(idx++, block_shape[1]); + kernel_.setArg(idx++, paddings[0]); + kernel_.setArg(idx++, paddings[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + input_shape_ = batch_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index 47f79c45c8bbf963f277ca35c594ce081f3bf140..a0aced7c021cdee7dfe55b3800e9da324e7abf59 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { public: MaceStatus Compute( @@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { std::vector input_shape_; }; -template -MaceStatus BatchToSpaceKernel::Compute( - OpContext *context, - const Tensor *batch_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *space_tensor) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - space_tensor->ResizeImage(output_shape, output_image_shape)); - - const uint32_t chan_blk = - static_cast(RoundUpDiv4(batch_tensor->dim(3))); - - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const char *kernel_name = "batch_to_space"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, batch_tensor->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape[0]); - kernel_.setArg(idx++, block_shape[1]); - kernel_.setArg(idx++, paddings[0]); - kernel_.setArg(idx++, paddings[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); 
- kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - input_shape_ = batch_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/bias_add.cc b/mace/ops/opencl/image/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f62f592f41ede1df35532775f88bae761623447 --- /dev/null +++ b/mace/ops/opencl/image/bias_add.cc @@ -0,0 +1,101 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/bias_add.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BiasAddKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); + built_options.emplace("-Dbias_add=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, + built_options, &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), 
nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h index 6c534a4b1e9cde4fdac5a100c36a0daf2d4fd8ce..7c25662da81b183ca88588dc756b724b50ed33ac 100644 --- a/mace/ops/opencl/image/bias_add.h +++ b/mace/ops/opencl/image/bias_add.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BiasAddKernel : public OpenCLBiasAddKernel { public: MaceStatus Compute( @@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel { std::vector input_shape_; }; -template -MaceStatus BiasAddKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - auto dt = DataTypeToEnum::value; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); - built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, - built_options, &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - 
MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/buffer_to_image.cc b/mace/ops/opencl/image/buffer_to_image.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb785e0cdbff3a5b0ef977e9894a7e93c8f0537a --- /dev/null +++ b/mace/ops/opencl/image/buffer_to_image.cc @@ -0,0 +1,164 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/buffer_to_image.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BufferToImage::Compute( + OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); + + uint32_t gws[2] = {static_cast(image_shape[0]), + static_cast(image_shape[1])}; + std::string kernel_name; + switch (type) { + case CONV2D_FILTER:kernel_name = "filter_buffer_to_image"; + break; + case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image"; + break; + case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image"; + break; + case ARGUMENT:kernel_name = "arg_buffer_to_image"; + break; + case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image"; + break; + case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image"; + break; + case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image"; + break; + case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image"; + break; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); + ss_tmp << "winograd_filter_buffer_to_image_" + << wino_blk_size << "x" << wino_blk_size; + kernel_name = ss_tmp.str(); + break; + } + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (input->dtype() == output->dtype()) { + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + 
DtToCLCMDDt(DT_FLOAT)); + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + if (type == CONV2D_FILTER) { + const index_t + inner_size = input->dim(1) * input->dim(2) * input->dim(3); + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(input->dim(0))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); + } + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + const std::vector lws = {16, kwg_size / 16}; + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 3d1366dfd69808e2220a20ac124003b0e04a0726..493f6579db7ced93681ad2b8b80b491edd934b8d 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BufferToImage : public OpenCLBufferTransformKernel { public: MaceStatus Compute( @@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus BufferToImage::Compute( - OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); - std::vector image_shape; - 
OpenCLUtil::CalImage2DShape(formatted_buffer_shape, - type, - &image_shape, - wino_blk_size); - MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); - - uint32_t gws[2] = {static_cast(image_shape[0]), - static_cast(image_shape[1])}; - std::string kernel_name; - switch (type) { - case CONV2D_FILTER: - kernel_name = "filter_buffer_to_image"; - break; - case DW_CONV2D_FILTER: - kernel_name = "dw_filter_buffer_to_image"; - break; - case IN_OUT_CHANNEL: - kernel_name = "in_out_buffer_to_image"; - break; - case ARGUMENT: - kernel_name = "arg_buffer_to_image"; - break; - case IN_OUT_HEIGHT: - kernel_name = "in_out_height_buffer_to_image"; - break; - case IN_OUT_WIDTH: - kernel_name = "in_out_width_buffer_to_image"; - break; - case WEIGHT_HEIGHT: - kernel_name = "weight_height_buffer_to_image"; - break; - case WEIGHT_WIDTH: - kernel_name = "weight_width_buffer_to_image"; - break; - case WINOGRAD_FILTER: { - std::stringstream ss_tmp; - gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); - ss_tmp << "winograd_filter_buffer_to_image_" - << wino_blk_size << "x" << wino_blk_size; - kernel_name = ss_tmp.str(); - break; - } - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (input->dtype() == output->dtype()) { - built_options.emplace( - "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel( - "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_buffer())); - MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, - "buffer offset not aligned"); - kernel_.setArg(idx++, - static_cast(input->buffer_offset() / - GetEnumTypeSize(input->dtype()))); - if (type == CONV2D_FILTER) { - const index_t - inner_size = input->dim(1) * input->dim(2) * input->dim(3); - kernel_.setArg(idx++, static_cast(input->dim(0))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - kernel_.setArg(idx++, static_cast(inner_size)); - } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(input->dim(0))); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(input->dim(0))); - } else { - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); - } - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = 
input->shape(); - } - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {16, kwg_size / 16}; - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/channel_shuffle.cc b/mace/ops/opencl/image/channel_shuffle.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cdbb1feea4a5e77834ce066b476bc3f0162aa5d --- /dev/null +++ b/mace/ops/opencl/image/channel_shuffle.cc @@ -0,0 +1,87 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/channel_shuffle.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ChannelShuffleKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK(input->dim(3) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(3)); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channels_per_group = channels / groups_; + const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); + + const uint32_t gws[3] = {static_cast(group_channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); + built_options.emplace("-Dchannel_shuffle=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("channel_shuffle", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, groups_); + kernel_.setArg(idx++, static_cast(channels_per_group)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h index 15111c7dd65e91cea946acfdc3841e400f9a17d7..371ecf22a6cf61e3e7c60b8af4abe981f3a1264e 100644 --- a/mace/ops/opencl/image/channel_shuffle.h +++ b/mace/ops/opencl/image/channel_shuffle.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { public: explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} @@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { std::vector input_shape_; }; -template -MaceStatus ChannelShuffleKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK(input->dim(3) % groups_ == 0, - "input channels must be an integral multiple of group. 
", - input->dim(3)); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - const index_t channels_per_group = channels / groups_; - const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); - - const uint32_t gws[3] = {static_cast(group_channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - - MACE_OUT_OF_RANGE_DEFINITION; - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); - built_options.emplace("-Dchannel_shuffle=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("channel_shuffle", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, groups_); - kernel_.setArg(idx++, static_cast(channels_per_group)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/concat.cc b/mace/ops/opencl/image/concat.cc index d6b0bc65802547561d989167055454020e6dd1d4..f4433b43ebb44e00f0711f734ffdd1b90b0b09df 100644 --- a/mace/ops/opencl/image/concat.cc +++ b/mace/ops/opencl/image/concat.cc @@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size) { @@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context, std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); if (input0->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + auto data_dt = input0->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); } + if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); } @@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context, MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, - 
const DataType dt, Tensor *output, uint32_t *kwg_size) { const index_t batch = output->dim(0); @@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, built_options, kernel)); *kwg_size = @@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context, } } // namespace concat + + +MaceStatus ConcatKernel::Compute( + OpContext *context, + const std::vector &input_list, + const int32_t axis, + Tensor *output) { + const int inputs_count = input_list.size(); + + const Tensor *input0 = input_list[0]; + + std::vector output_shape(input0->shape()); + for (int i = 1; i < inputs_count; ++i) { + const Tensor *input = input_list[i]; + MACE_CHECK(input->dim_size() == input0->dim_size(), + "Ranks of all input tensors must be same."); + for (int j = 0; j < input->dim_size(); ++j) { + if (j == axis) { + continue; + } + MACE_CHECK(input->dim(j) == input0->dim(j), + "Dimensions of inputs should equal except axis."); + } + output_shape[axis] += input->dim(axis); + } + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + switch (inputs_count) { + case 2: + return concat::Concat2( + context, &kernel_, input_list[0], input_list[1], + &input_shape_, output, &kwg_size_); + default: + return concat::ConcatN(context, + &kernel_, + input_list, + output, + &kwg_size_); + } +} + } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index 3a7af7ed15b35b6f35fb4e844f76c61b1f1a6985..f1e51fd96e7312d30419eafada40796f000c55c2 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context, MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, - const DataType dt, Tensor *output, uint32_t *kwg_size); } // namespace concat -template class ConcatKernel : public OpenCLConcatKernel { public: ConcatKernel() {} @@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel { std::vector input_shape_; }; -template -MaceStatus ConcatKernel::Compute( - OpContext *context, - const std::vector &input_list, - const int32_t axis, - Tensor *output) { - const int inputs_count = input_list.size(); - - const Tensor *input0 = input_list[0]; - - std::vector output_shape(input0->shape()); - for (int i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; - MACE_CHECK(input->dim_size() == input0->dim_size(), - "Ranks of all input tensors must be same."); - for (int j = 0; j < input->dim_size(); ++j) { - if (j == axis) { - continue; - } - MACE_CHECK(input->dim(j) == input0->dim(j), - "Dimensions of inputs should equal except axis."); - } - output_shape[axis] += input->dim(axis); - } - std::vector image_shape; - 
OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - switch (inputs_count) { - case 2: - return concat::Concat2( - context, &kernel_, input_list[0], input_list[1], - DataTypeToEnum::value, &input_shape_, output, &kwg_size_); - default: - return concat::ConcatN(context, &kernel_, input_list, - DataTypeToEnum::value, output, &kwg_size_); - } -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/conv_2d.cc b/mace/ops/opencl/image/conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..20c101a2410eb11c1a29fbe7f9aa4cfefda9511f --- /dev/null +++ b/mace/ops/opencl/image/conv_2d.cc @@ -0,0 +1,185 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/conv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_blk_size) { + if (filter_shape[2] != 3 || filter_shape[3] != 3 || + strides[0] > 1 || strides[1] > 1 || + dilations[0] > 1 || dilations[1] > 1) { + return false; + } + index_t out_channels = filter_shape[0]; + index_t in_channels = filter_shape[1]; + auto opencl_image_max_size = runtime->GetMaxImage2DSize(); + auto check_opencl_limit = [&](int block_size) -> bool { + int sqr_block = (block_size + 2) * (block_size + 2); + uint64_t transformed_width = static_cast(output_shape[0] * + ((output_shape[1] + block_size - 1) / block_size) * + ((output_shape[2] + block_size - 1) / block_size)); + return (transformed_width < opencl_image_max_size[0] && + static_cast(sqr_block * in_channels) + < opencl_image_max_size[1] && + static_cast(sqr_block * out_channels) + < opencl_image_max_size[1]); + }; + // GPU only supports 4x4 and 2x2 gpu winograd convolution + if (*wino_blk_size == 4) { + // if block size == 4 exceed OpenCL image size limitation, fallback to 2 + if (!check_opencl_limit(4)) { + *wino_blk_size = 2; + } else { + return true; + } + } + return check_opencl_limit(2); +} + +MaceStatus Conv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const int wino_blk_size, + Tensor *output) { + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1] || + (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << ",dilations " << 
dilations[0] << "x" << dilations[1] + << " is not implemented yet."; + MACE_NOT_IMPLEMENTED; + } + + // Reshape output + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + std::function conv_func; + + if (wino_blk_size != 0) { + // use winograd covolution + conv_func = [&]() -> MaceStatus { + cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; + uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; + return WinogradConv2dK3x3S1(context, + kernels, + input, + filter, + bias, + paddings.data(), + activation, + relux_max_limit, + leakyrelu_coefficient, + wino_blk_size, + &input_shape_, + output, + kwg_size); + }; + } else if (kernel_h == 1 && kernel_w == 1) { + conv_func = [&]() -> MaceStatus { + return Conv2dK1x1(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else if (kernel_h == 3 && kernel_w == 3) { + conv_func = [&]() -> MaceStatus { + return Conv2dK3x3(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else { + conv_func = [&]() -> MaceStatus { + return Conv2d(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } + + return conv_func(); +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index 5df0847b5e1d4160c4484ccb06d6118df4bb70b0..84fae55dff77afef2056f3f8e1628413a73e0bc2 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, const int wino_blk_size, std::vector *prev_input_shape, Tensor *output, uint32_t 
*kwg_size[3]); -template class Conv2dKernel : public OpenCLConv2dKernel { public: bool CheckUseWinograd( @@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; -template -bool Conv2dKernel::CheckUseWinograd( - OpenCLRuntime *runtime, - const std::vector &filter_shape, - const std::vector &output_shape, - const int *strides, - const int *dilations, - int *wino_blk_size) { - if (filter_shape[2] != 3 || filter_shape[3] != 3 || - strides[0] > 1 || strides[1] > 1 || - dilations[0] > 1 || dilations[1] > 1) { - return false; - } - index_t out_channels = filter_shape[0]; - index_t in_channels = filter_shape[1]; - auto opencl_image_max_size = runtime->GetMaxImage2DSize(); - auto check_opencl_limit = [&](int block_size) -> bool { - int sqr_block = (block_size + 2) * (block_size + 2); - uint64_t transformed_width = static_cast(output_shape[0] * - ((output_shape[1] + block_size - 1) / block_size) * - ((output_shape[2] + block_size - 1) / block_size)); - return (transformed_width < opencl_image_max_size[0] && - static_cast(sqr_block * in_channels) - < opencl_image_max_size[1] && - static_cast(sqr_block * out_channels) - < opencl_image_max_size[1]); - }; - // GPU only supports 4x4 and 2x2 gpu winograd convolution - if (*wino_blk_size == 4) { - // if block size == 4 exceed OpenCL image size limitation, fallback to 2 - if (!check_opencl_limit(4)) { - *wino_blk_size = 2; - } else { - return true; - } - } - return check_opencl_limit(2); -} - -template -MaceStatus Conv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const int wino_blk_size, - Tensor *output) { - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (strides[0] != strides[1] || - (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) { - LOG(WARNING) << "OpenCL conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides[0] << "x" << strides[1] - << ",dilations " << dilations[0] << "x" << dilations[1] - << " is not implemented yet."; - MACE_NOT_IMPLEMENTED; - } - - // Reshape output - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter->shape().data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - std::function conv_func; - - if (wino_blk_size != 0) { - // use winograd covolution - conv_func = [&]() -> MaceStatus { - cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; - uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; - return WinogradConv2dK3x3S1(context, - kernels, - input, - filter, - bias, - paddings.data(), - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - wino_blk_size, - &input_shape_, - output, - kwg_size); - }; - } else if (kernel_h == 1 && 
kernel_w == 1) { - conv_func = [&]() -> MaceStatus { - return Conv2dK1x1(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } else if (kernel_h == 3 && kernel_w == 3) { - conv_func = [&]() -> MaceStatus { - return Conv2dK3x3(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } else { - conv_func = [&]() -> MaceStatus { - return Conv2d(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } - - return conv_func(); -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 374d262ae34a4938e40f94dd941e95735bcedd4e..460d01323dccd584b880f2cdc27b5d2e4c2735fe 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -66,21 +66,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); const index_t batch = output->dim(0); @@ -106,31 +105,38 @@ extern MaceStatus Conv2dK1x1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index 
125a973ae7de4409b31fa2a716c35409d5955d0e..a3bd170f64079a5b4533dd2a4fb104dbee752cfd 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -59,21 +59,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -93,29 +92,36 @@ extern MaceStatus Conv2dK3x3(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 7f0250cbc4ebc73cfa52c6041c9da8c95b7e3892..e1979c03a715a8ec0a74bf26d35e3f34484d0c55 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -67,21 +67,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2d(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const 
index_t width = output->dim(2); @@ -101,29 +100,36 @@ extern MaceStatus Conv2d(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, diff --git a/mace/ops/opencl/image/crop.cc b/mace/ops/opencl/image/crop.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad4e703d6f712e699ff6c73296e04559779e5d60 --- /dev/null +++ b/mace/ops/opencl/image/crop.cc @@ -0,0 +1,117 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
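+
+// OpenCL image implementation of the Crop op. It checks that both inputs are
+// 4-D, takes the output shape from the second input along dimensions with a
+// non-negative offset, requires the channel offset to be divisible by 4, and
+// then builds, caches and auto-tunes the 3-D "crop" kernel.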
+ +#include "mace/ops/opencl/image/crop.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus CropKernel::Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) { + const int32_t inputs_count = static_cast(input_list.size()); + MACE_CHECK(inputs_count >= 2) + << "Crop opencl kernel only support 2 elements input"; + const Tensor *input0 = input_list[0]; + const Tensor *input1 = input_list[1]; + const uint32_t in0_dims = static_cast(input0->dim_size()); + const uint32_t in1_dims = static_cast(input0->dim_size()); + MACE_CHECK(in0_dims == 4 && in1_dims == 4, + "Crop op only supports 4-dims inputs now."); + + std::vector offsets(4, 0); + + std::vector output_shape(input0->shape()); + for (index_t i = 0; i < in0_dims; ++i) { + if (offset_[i] >= 0) { + output_shape[i] = input1->dim(i); + offsets[i] = offset_[i]; + MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) + << "the crop for dimension " << i + << " is out of bound, first input size " + << input0->dim(i) << ", offset " << offsets[i] + << ", second input size " << input1->dim(i); + } + } + MACE_CHECK(offsets[3] % 4 == 0, + "MACE opencl only supports cropping channel" + " offset divisible by 4."); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); + const index_t channel_blk = RoundUpDiv4(output->dim(3)); + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)) + }; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); + built_options.emplace("-Dcrop=" + kernel_name); + auto dt = input0->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + kernel_.setArg(idx++, static_cast(offsets[0])); + kernel_.setArg(idx++, static_cast(offsets[1])); + kernel_.setArg(idx++, static_cast(offsets[2])); + kernel_.setArg(idx++, static_cast(offset_chan_blk)); + kernel_.setArg(idx++, static_cast(input0->dim(1))); + kernel_.setArg(idx++, static_cast(input0->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("crop_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace 
mace diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index d121b76339239ffd3964a9b756e6bebfaa838d48..c2f1c53aa2383ab89669be2520a9af3c1f2a27c8 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class CropKernel : public OpenCLCropKernel { public: explicit CropKernel( @@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel { std::vector input_shape_; }; -template -MaceStatus CropKernel::Compute( - OpContext *context, - const std::vector &input_list, - Tensor *output) { - const int32_t inputs_count = static_cast(input_list.size()); - MACE_CHECK(inputs_count >= 2) - << "Crop opencl kernel only support 2 elements input"; - const Tensor *input0 = input_list[0]; - const Tensor *input1 = input_list[1]; - const uint32_t in0_dims = static_cast(input0->dim_size()); - const uint32_t in1_dims = static_cast(input0->dim_size()); - MACE_CHECK(in0_dims == 4 && in1_dims == 4, - "Crop op only supports 4-dims inputs now."); - - std::vector offsets(4, 0); - - std::vector output_shape(input0->shape()); - for (index_t i = 0; i < in0_dims; ++i) { - if (offset_[i] >= 0) { - output_shape[i] = input1->dim(i); - offsets[i] = offset_[i]; - MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) - << "the crop for dimension " << i - << " is out of bound, first input size " - << input0->dim(i) << ", offset " << offsets[i] - << ", second input size " << input1->dim(i); - } - } - MACE_CHECK(offsets[3] % 4 == 0, - "MACE opencl only supports cropping channel" - " offset divisible by 4."); - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); - const index_t channel_blk = RoundUpDiv4(output->dim(3)); - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)) - }; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); - built_options.emplace("-Dcrop=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - kernel_.setArg(idx++, static_cast(offsets[0])); - kernel_.setArg(idx++, static_cast(offsets[1])); - kernel_.setArg(idx++, static_cast(offsets[2])); - kernel_.setArg(idx++, static_cast(offset_chan_blk)); - kernel_.setArg(idx++, static_cast(input0->dim(1))); - kernel_.setArg(idx++, static_cast(input0->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - 
std::string tuning_key = - Concat("crop_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/deconv_2d.cc b/mace/ops/opencl/image/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..0509fcf005dc9abf20ad241cf45e8e3cd755a1c7 --- /dev/null +++ b/mace/ops/opencl/image/deconv_2d.cc @@ -0,0 +1,158 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/deconv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus Deconv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const std::vector &output_shape, + Tensor *output) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); + const int width_tile = 5; + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + width_tile - 1) / width_tile) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); + built_options.emplace("-Ddeconv_2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, static_cast(input_channel_blocks)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("deconv2d_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index 058cc094ac1874bd1c72a588bc6215670daed74b..aa3b9d249b58cd3982866c71ef07b30ee24c75bc 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class Deconv2dKernel : public OpenCLDeconv2dKernel { public: MaceStatus Compute( @@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { std::vector input_shape_; }; -template -MaceStatus Deconv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const std::vector 
&output_shape, - Tensor *output) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - const DataType dt = DataTypeToEnum::value; - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - const index_t input_channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t input_channel_blocks = RoundUpDiv4(input_channels); - const int stride_h = strides[0]; - const int stride_w = strides[1]; - MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); - const int width_tile = 5; - const index_t n_strides = (width + stride_w - 1) / stride_w; - const index_t width_blocks = - ((n_strides + width_tile - 1) / width_tile) * stride_w; - const float stride_h_r = 1.f / static_cast(stride_h); - const float stride_w_r = 1.f / static_cast(stride_w); - const int padding_h = (padding_data[0] + 1) >> 1; - const int padding_w = (padding_data[1] + 1) >> 1; - - const int align_h = stride_h - 1 - padding_h; - const int align_w = stride_w - 1 - padding_w; - const int kernel_size = filter->dim(2) * filter->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); - built_options.emplace("-Ddeconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(filter->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, static_cast(stride_h)); - kernel_.setArg(idx++, static_cast(stride_w)); - kernel_.setArg(idx++, stride_h_r); - kernel_.setArg(idx++, stride_w_r); - kernel_.setArg(idx++, static_cast(align_h)); - kernel_.setArg(idx++, static_cast(align_w)); - kernel_.setArg(idx++, static_cast(padding_h)); - kernel_.setArg(idx++, static_cast(padding_w)); - kernel_.setArg(idx++, static_cast(filter->dim(2))); - kernel_.setArg(idx++, static_cast(filter->dim(3))); - kernel_.setArg(idx++, static_cast(kernel_size)); - kernel_.setArg(idx++, static_cast(input_channel_blocks)); - kernel_.setArg(idx++, static_cast(channel_blocks)); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("deconv2d_opencl_kernel_", activation, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depth_to_space.cc b/mace/ops/opencl/image/depth_to_space.cc new file mode 100644 index 0000000000000000000000000000000000000000..b885dddfc93316f3100e7478c3c54246171cafbf --- /dev/null +++ b/mace/ops/opencl/image/depth_to_space.cc @@ -0,0 +1,120 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/opencl/image/depth_to_space.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+
+
+MaceStatus DepthToSpaceKernel::Compute(
+    OpContext *context,
+    const Tensor *input,
+    Tensor *output) {
+  const index_t batch = input->dim(0);
+  const index_t input_height = input->dim(1);
+  const index_t input_width = input->dim(2);
+  const index_t input_depth = input->dim(3);
+
+  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+             "input depth should be dividable by block_size * block_size ",
+             input_depth);
+
+  const index_t output_height = input_height * block_size_;
+  const index_t output_width = input_width * block_size_;
+  const index_t output_depth = input_depth / (block_size_ * block_size_);
+  MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
+             "output channel not support:") << output_depth;
+
+  std::vector<index_t> output_shape = {batch,
+                                       output_height,
+                                       output_width,
+                                       output_depth};
+  std::vector<size_t> image_shape;
+  OpenCLUtil::CalImage2DShape(output_shape,
+                              OpenCLBufferType::IN_OUT_CHANNEL,
+                              &image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
+
+  uint32_t gws[3];
+  if (output_depth < 3) {
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
+    gws[1] = static_cast<uint32_t>(input_width);
+    gws[2] = static_cast<uint32_t>(input_height * batch);
+  } else {
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
+    gws[1] = static_cast<uint32_t>(output_width);
+    gws[2] = static_cast<uint32_t>(output_height * batch);
+  }
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    const char *kernel_name = "depth_to_space";
+    if (output_depth < 4) {
+      built_options.emplace(MakeString("-DDEPTH", output_depth));
+      if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
+    }
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+    std::stringstream kernel_name_ss;
+    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    built_options.emplace(kernel_name_ss.str());
+    auto dt = input->dtype();
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
+  }
+
+  std::string tuning_key = Concat("depth_to_space",
+                                  batch, output_height,
+                                  output_width, output_depth);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, context->future()));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return 
MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index 990e06ccef6771b2d7ab8a4e8bb31446e7feeb40..ac68cbdb02adffad9b7bfc911363b1c95d2a7a86 100644 --- a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { public: explicit DepthToSpaceKernel(const int block_size) @@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { std::vector input_shape_; }; -template -MaceStatus DepthToSpaceKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, - "input depth should be dividable by block_size * block_size ", - input_depth); - - const index_t output_height = input_height * block_size_; - const index_t output_width = input_width * block_size_; - const index_t output_depth = input_depth / (block_size_ * block_size_); - MACE_CHECK(output_depth % 4 == 0 || output_depth < 4, - "output channel not support:") << output_depth; - - std::vector output_shape = {batch, - output_height, - output_width, - output_depth}; - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - uint32_t gws[3]; - if (output_depth < 3) { - gws[0] = static_cast(RoundUpDiv4(input_depth)); - gws[1] = static_cast(input_width); - gws[2] = static_cast(input_height * batch); - } else { - gws[0] = static_cast(RoundUpDiv4(output_depth)); - gws[1] = static_cast(output_width); - gws[2] = static_cast(output_height * batch); - } - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - const char *kernel_name = "depth_to_space"; - if (output_depth < 4) { - built_options.emplace(MakeString("-DDEPTH", output_depth)); - if (output_depth != 3) kernel_name = "depth_to_space_d1_d2"; - } - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input_height)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(output_height)); - kernel_.setArg(idx++, 
static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - std::string tuning_key = Concat("depth_to_space", - batch, output_height, - output_width, output_depth); - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_conv2d.cc b/mace/ops/opencl/image/depthwise_conv2d.cc index 5b86c68414d46517c48382142193187f600efe7b..0101ea136a8b14d825c9d6ec89c074e0d005f01b 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.cc +++ b/mace/ops/opencl/image/depthwise_conv2d.cc @@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size) { @@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(MakeString("-DSTRIDE=", stride)); switch (activation) { @@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context, } } // namespace depthwise + + +MaceStatus DepthwiseConv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1]) { + LOG(WARNING) << "OpenCL depthwise conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << " is not implemented yet, using slow version"; + MACE_NOT_IMPLEMENTED; + } + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + return depthwise::DepthwiseConv2d( + context, &kernel_, input, filter, bias, strides[0], 
paddings.data(), + dilations, activation, relux_max_limit, leakyrelu_coefficient, + &input_shape_, output, &kwg_size_); +} + } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index 13a64076f9200d9f180159b5bd4455aaf316db99..f4bc4f2a0c92adc2edca4c5eb820c2f00f63d680 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); } // namespace depthwise - -template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: MaceStatus Compute( @@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { std::vector input_shape_; }; -template -MaceStatus DepthwiseConv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (strides[0] != strides[1]) { - LOG(WARNING) << "OpenCL depthwise conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides[0] << "x" << strides[1] - << " is not implemented yet, using slow version"; - MACE_NOT_IMPLEMENTED; - } - - // Create a fake conv_2d filter to calculate the paddings and output size - std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->dim(0) * filter->dim(1); - fake_filter_shape[1] = filter->dim(1); - fake_filter_shape[2] = filter->dim(2); - fake_filter_shape[3] = filter->dim(3); - - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), fake_filter_shape.data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - return depthwise::DepthwiseConv2d( - context, &kernel_, input, filter, bias, strides[0], paddings.data(), - dilations, activation, relux_max_limit, leakyrelu_coefficient, - DataTypeToEnum::value, &input_shape_, output, &kwg_size_); -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_deconv2d.cc b/mace/ops/opencl/image/depthwise_deconv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a8d7eb9919959cc63bcc01f127344fb72ee8af5 --- /dev/null +++ b/mace/ops/opencl/image/depthwise_deconv2d.cc @@ -0,0 +1,165 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/depthwise_deconv2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus DepthwiseDeconv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const int group, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const std::vector &output_shape, + Tensor *output) { + const index_t batch = output_shape[0]; + const index_t height = output_shape[1]; + const index_t width = output_shape[2]; + const index_t channels = output_shape[3]; + const index_t input_channels = input->dim(3); + const index_t multiplier = filter->dim(0); + + MACE_CHECK(group == channels && group == input_channels && multiplier == 1, + "opencl image deconv only supports depthwise type group."); + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + const index_t channel_blocks = RoundUpDiv4(channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); + const int width_tile = 5; + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + width_tile - 1) / width_tile) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d"); + built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("depthwise_deconv2d_kernel_", + activation, + output->dim(0), + output->dim(1), + output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index 53d0536fe85b220f635d992d03c065e409a14df0..2055511678d8340da655b298c2a8a163279c95a3 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { public: MaceStatus Compute( @@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { std::vector input_shape_; }; -template -MaceStatus DepthwiseDeconv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const int group, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const 
std::vector &output_shape, - Tensor *output) { - const index_t batch = output_shape[0]; - const index_t height = output_shape[1]; - const index_t width = output_shape[2]; - const index_t channels = output_shape[3]; - const index_t input_channels = input->dim(3); - const index_t multiplier = filter->dim(0); - - MACE_CHECK(group == channels && group == input_channels && multiplier == 1, - "opencl image deconv only supports depthwise type group."); - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - const DataType dt = DataTypeToEnum::value; - - const index_t channel_blocks = RoundUpDiv4(channels); - const int stride_h = strides[0]; - const int stride_w = strides[1]; - MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); - const int width_tile = 5; - const index_t n_strides = (width + stride_w - 1) / stride_w; - const index_t width_blocks = - ((n_strides + width_tile - 1) / width_tile) * stride_w; - const float stride_h_r = 1.f / static_cast(stride_h); - const float stride_w_r = 1.f / static_cast(stride_w); - const int padding_h = (padding_data[0] + 1) >> 1; - const int padding_w = (padding_data[1] + 1) >> 1; - - const int align_h = stride_h - 1 - padding_h; - const int align_w = stride_w - 1 - padding_w; - const int kernel_size = filter->dim(2) * filter->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d"); - built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(filter->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, static_cast(stride_h)); - kernel_.setArg(idx++, static_cast(stride_w)); - kernel_.setArg(idx++, stride_h_r); - kernel_.setArg(idx++, stride_w_r); - kernel_.setArg(idx++, static_cast(align_h)); - kernel_.setArg(idx++, static_cast(align_w)); - kernel_.setArg(idx++, static_cast(padding_h)); - kernel_.setArg(idx++, static_cast(padding_w)); - kernel_.setArg(idx++, static_cast(filter->dim(2))); - kernel_.setArg(idx++, static_cast(filter->dim(3))); - kernel_.setArg(idx++, static_cast(kernel_size)); - kernel_.setArg(idx++, static_cast(channel_blocks)); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("depthwise_deconv2d_kernel_", - activation, - output->dim(0), - output->dim(1), - output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/eltwise.cc b/mace/ops/opencl/image/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..437cfce0255f8bcb337242612b4ec08a3c4bfe85 --- /dev/null +++ b/mace/ops/opencl/image/eltwise.cc @@ -0,0 +1,168 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/opencl/image/eltwise.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus EltwiseKernel::Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) { + bool swapped = false; + std::string input1_type = ""; + if (input1 == nullptr) { + input1_type = "INPUT_SCALAR"; + } else { + MACE_CHECK((input0->dim_size() == input1->dim_size() + && input0->dim_size() == 4) || + input0->dim_size() == 1 || input1->dim_size() == 1) + << "Inputs of Eltwise op must be same shape or fulfill broadcast logic"; + MACE_CHECK(type_ != EltwiseType::EQUAL) + << "Eltwise op on GPU does not support EQUAL"; + // broadcast + if (input0->size() != input1->size() || + input0->dim_size() != input1->dim_size()) { + if (input0->size() < input1->size() + || input0->dim_size() < input1->dim_size()) { + std::swap(input0, input1); + swapped = true; + } + if (input1->dim_size() == 1 + || (input1->dim(0) == 1 && input1->dim(1) == 1 + && input1->dim(2) == 1)) { + // Tensor-Vector element wise + if (input0->dim(3) == input1->dim(input1->dim_size()-1)) { + input1_type = "INPUT_VECTOR"; + } else { + LOG(FATAL) << "Inputs not match the broadcast logic, " + << MakeString(input0->shape()) << " vs " + << MakeString(input1->shape()); + } + } else { // must be 4-D + if (input0->dim(0) == input1->dim(0) + && input1->dim(1) == 1 + && input1->dim(2) == 1 + && input0->dim(3) == input1->dim(3)) { + input1_type = "INPUT_BATCH_VECTOR"; + } else if (input0->dim(0) == input1->dim(0) + && input0->dim(1) == input1->dim(1) + && input0->dim(2) == input1->dim(2) + && input1->dim(3) == 1) { + // broadcast on channel dimension + input1_type = "INPUT_TENSOR_BC_CHAN"; + } else { + LOG(FATAL) << "Element-Wise op only support broadcast on" + " channel dimension:" + "Tensor-BatchVector(4D-[N,1,1,C]) " + "and Tensor-Tensor(4D-[N,H,W,1]). 
but got " + << MakeString(input0->shape()) << " vs " + << MakeString(input1->shape()); + } + } + } + } + + if (scalar_input_index_ == 0) { + swapped = !swapped; + } + + std::vector output_shape(4); + output_shape[0] = input0->dim(0); + output_shape[1] = input0->dim(1); + output_shape[2] = input0->dim(2); + output_shape[3] = input0->dim(3); + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(batch_height_pixels)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); + built_options.emplace("-Deltwise=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (!input1_type.empty()) { + built_options.emplace("-D" + input1_type); + } + if (swapped) built_options.emplace("-DSWAPPED"); + if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR"); + if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + if (input1 == nullptr) { + kernel_.setArg(idx++, scalar_input_); + } else { + kernel_.setArg(idx++, *(input1->opencl_image())); + } + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + if (!coeff_.empty()) { + kernel_.setArg(idx++, coeff_[0]); + kernel_.setArg(idx++, coeff_[1]); + } + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index 9c8a1a3133e63d7e8c486ca292f86f0fa2b981db..5678f9c72ffbe2ec706e2b44bd73457f938cb585 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/eltwise.h" +#include "mace/ops/common/eltwise_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ 
-32,7 +32,6 @@ namespace ops { namespace opencl { namespace image { -template class EltwiseKernel : public OpenCLEltwiseKernel { public: explicit EltwiseKernel( @@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel { std::vector input_shape_; }; -template -MaceStatus EltwiseKernel::Compute( - OpContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output) { - bool swapped = false; - std::string input1_type = ""; - if (input1 == nullptr) { - input1_type = "INPUT_SCALAR"; - } else { - MACE_CHECK((input0->dim_size() == input1->dim_size() - && input0->dim_size() == 4) || - input0->dim_size() == 1 || input1->dim_size() == 1) - << "Inputs of Eltwise op must be same shape or fulfill broadcast logic"; - MACE_CHECK(type_ != EltwiseType::EQUAL) - << "Eltwise op on GPU does not support EQUAL"; - // broadcast - if (input0->size() != input1->size() || - input0->dim_size() != input1->dim_size()) { - if (input0->size() < input1->size() - || input0->dim_size() < input1->dim_size()) { - std::swap(input0, input1); - swapped = true; - } - if (input1->dim_size() == 1 - || (input1->dim(0) == 1 && input1->dim(1) == 1 - && input1->dim(2) == 1)) { - // Tensor-Vector element wise - if (input0->dim(3) == input1->dim(input1->dim_size()-1)) { - input1_type = "INPUT_VECTOR"; - } else { - LOG(FATAL) << "Inputs not match the broadcast logic, " - << MakeString(input0->shape()) << " vs " - << MakeString(input1->shape()); - } - } else { // must be 4-D - if (input0->dim(0) == input1->dim(0) - && input1->dim(1) == 1 - && input1->dim(2) == 1 - && input0->dim(3) == input1->dim(3)) { - input1_type = "INPUT_BATCH_VECTOR"; - } else if (input0->dim(0) == input1->dim(0) - && input0->dim(1) == input1->dim(1) - && input0->dim(2) == input1->dim(2) - && input1->dim(3) == 1) { - // broadcast on channel dimension - input1_type = "INPUT_TENSOR_BC_CHAN"; - } else { - LOG(FATAL) << "Element-Wise op only support broadcast on" - " channel dimension:" - "Tensor-BatchVector(4D-[N,1,1,C]) " - "and Tensor-Tensor(4D-[N,H,W,1]). 
but got " - << MakeString(input0->shape()) << " vs " - << MakeString(input1->shape()); - } - } - } - } - - if (scalar_input_index_ == 0) { - swapped = !swapped; - } - - std::vector output_shape(4); - output_shape[0] = input0->dim(0); - output_shape[1] = input0->dim(1); - output_shape[2] = input0->dim(2); - output_shape[3] = input0->dim(3); - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(batch_height_pixels)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); - built_options.emplace("-Deltwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (!input1_type.empty()) { - built_options.emplace("-D" + input1_type); - } - if (swapped) built_options.emplace("-DSWAPPED"); - if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR"); - if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - if (input1 == nullptr) { - kernel_.setArg(idx++, scalar_input_); - } else { - kernel_.setArg(idx++, *(input1->opencl_image())); - } - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - if (!coeff_.empty()) { - kernel_.setArg(idx++, coeff_[0]); - kernel_.setArg(idx++, coeff_[1]); - } - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/fully_connected.cc b/mace/ops/opencl/image/fully_connected.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ec83e91b771d49abc379d4aa312dd5caa90ac18 --- /dev/null +++ b/mace/ops/opencl/image/fully_connected.cc @@ -0,0 +1,162 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/fully_connected.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus FullyConnectedKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const index_t batch = output->dim(0); + const index_t output_size = output->dim(3); + const index_t output_blocks = RoundUpDiv4(output_size); + + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); + built_options.emplace("-Dfully_connected_width=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, + built_options, &kernel_)); + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + + gws_ = {4, (wave_size / 4), static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } else { + gws_ = {4, 8, static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + const index_t batch = output->dim(0); + const index_t output_blocks = RoundUpDiv4(output->dim(3)); + gws_[2] = static_cast(batch * output_blocks); + + uint32_t idx = 0; + 
MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws_); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)), + nullptr); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); + kernel_.setArg(idx++, static_cast(output_blocks)); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + + input_shape_ = input->shape(); + } + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws_.size()); + for (size_t i = 0; i < lws_.size(); ++i) { + roundup_gws[i] = RoundUp(gws_[i], lws_[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } + MACE_OUT_OF_RANGE_VALIDATION; + MACE_CL_RET_STATUS(error); + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 82386b7444396edb8084e3e28978cedc790e5ff7..9f1bae647f33c5906f31f312c5a094d64ef322e6 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -23,6 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -30,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class FullyConnectedKernel : public OpenCLFullyConnectedKernel { public: MaceStatus Compute( @@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { std::vector input_shape_; }; -template -MaceStatus FullyConnectedKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const index_t batch = output->dim(0); - const index_t output_size = output->dim(3); - const index_t output_blocks = RoundUpDiv4(output_size); - - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); - 
built_options.emplace("-Dfully_connected_width=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (bias != nullptr) { - built_options.emplace("-DBIAS"); - } - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, - built_options, &kernel_)); - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - - gws_ = {4, (wave_size / 4), static_cast(batch * output_blocks)}; - - const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); - lws_ = {gws_[0], gws_[1], inter_local_blks}; - } else { - gws_ = {4, 8, static_cast(batch * output_blocks)}; - - const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); - lws_ = {gws_[0], gws_[1], inter_local_blks}; - } - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - gws_[2] = static_cast(batch * output_blocks); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(weight->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); - kernel_.setArg(idx++, static_cast(output_blocks)); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - - input_shape_ = input->shape(); - } - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]), - cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws_.size()); - for (size_t i = 0; i < lws_.size(); ++i) { - roundup_gws[i] = RoundUp(gws_[i], lws_[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); - } - MACE_OUT_OF_RANGE_VALIDATION; - MACE_CL_RET_STATUS(error); - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // 
namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/image_to_buffer.cc b/mace/ops/opencl/image/image_to_buffer.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a54ba740294c64ef4de270c31576abf7b9281dd --- /dev/null +++ b/mace/ops/opencl/image/image_to_buffer.cc @@ -0,0 +1,159 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/image_to_buffer.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ImageToBuffer::Compute(OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->Resize(input->shape())); + + uint32_t gws[2] = {static_cast(image_shape[0]), + static_cast(image_shape[1])}; + std::string kernel_name; + switch (type) { + case CONV2D_FILTER:kernel_name = "filter_image_to_buffer"; + break; + case IN_OUT_CHANNEL:kernel_name = "in_out_image_to_buffer"; + break; + case ARGUMENT:kernel_name = "arg_image_to_buffer"; + break; + case IN_OUT_HEIGHT:kernel_name = "in_out_height_image_to_buffer"; + break; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); + ss_tmp << "winograd_filter_image_to_buffer_" + << wino_blk_size << "x" << wino_blk_size; + kernel_name = ss_tmp.str(); + break; + } + case WEIGHT_HEIGHT:kernel_name = "weight_height_image_to_buffer"; + break; + case WEIGHT_WIDTH:kernel_name = "weight_width_image_to_buffer"; + break; + case DW_CONV2D_FILTER: + case IN_OUT_WIDTH:LOG(FATAL) + << "IN_OUT_WIDTH only support buffer to image now"; + break; + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (output->dtype() == input->dtype()) { + auto data_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, 
*(output->opencl_buffer())); + if (type == CONV2D_FILTER) { + const index_t + inner_size = output->dim(1) * output->dim(2) * output->dim(3); + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(output->dim(0))); + } else if (type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(input->opencl_image())); + input_shape_ = input->shape(); + } + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + const std::vector lws = {16, kwg_size / 16}; + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index b91b8ba1fe07fcee684318169cf77aa543297c44..85893f6b283da5b659d30466568c65c52d931954 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -28,7 +28,6 @@ namespace ops { namespace opencl { namespace image { -template class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, @@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus ImageToBuffer::Compute(OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); - std::vector image_shape; - OpenCLUtil::CalImage2DShape(formatted_buffer_shape, - type, - &image_shape, - wino_blk_size); - MACE_RETURN_IF_ERROR(output->Resize(input->shape())); - - uint32_t gws[2] = {static_cast(image_shape[0]), - static_cast(image_shape[1])}; - std::string kernel_name; - switch (type) { - case CONV2D_FILTER: - kernel_name = "filter_image_to_buffer"; - break; - case IN_OUT_CHANNEL: - kernel_name = "in_out_image_to_buffer"; - break; - case ARGUMENT: - kernel_name = "arg_image_to_buffer"; - break; - case IN_OUT_HEIGHT: - kernel_name = "in_out_height_image_to_buffer"; - break; - case WINOGRAD_FILTER: { - 
std::stringstream ss_tmp; - gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); - ss_tmp << "winograd_filter_image_to_buffer_" - << wino_blk_size << "x" << wino_blk_size; - kernel_name = ss_tmp.str(); - break; - } - case WEIGHT_HEIGHT: - kernel_name = "weight_height_image_to_buffer"; - break; - case WEIGHT_WIDTH: - kernel_name = "weight_width_image_to_buffer"; - break; - case DW_CONV2D_FILTER: - case IN_OUT_WIDTH: - LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now"; - break; - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (output->dtype() == input->dtype()) { - built_options.emplace( - "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(output->opencl_buffer())); - if (type == CONV2D_FILTER) { - const index_t - inner_size = output->dim(1) * output->dim(2) * output->dim(3); - kernel_.setArg(idx++, static_cast(output->dim(0))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(3))); - kernel_.setArg(idx++, static_cast(inner_size)); - } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(output->dim(0))); - } else if (type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(output->dim(0))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(3))); - } else { - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); - } - kernel_.setArg(idx++, *(input->opencl_image())); - input_shape_ = input->shape(); - } - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {16, kwg_size / 16}; - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, 
event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/lstm_cell.cc b/mace/ops/opencl/image/lstm_cell.cc new file mode 100644 index 0000000000000000000000000000000000000000..987d0b1b338eb20460fed030b94b7858efbd6211 --- /dev/null +++ b/mace/ops/opencl/image/lstm_cell.cc @@ -0,0 +1,104 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/lstm_cell.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus LSTMCellKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output) { + MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, + "LSTM hidden units should be a multiple of 4"); + + const index_t height = input->dim(0); + const index_t width = input->dim(1); + const index_t hidden_units = pre_output->dim(1); + const index_t w_blocks = hidden_units >> 2; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); + built_options.emplace("-Dlstmcell=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[2] = {static_cast(w_blocks), + static_cast(height)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + std::vector output_shape_padded = {height, 1, 1, hidden_units}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape_padded, + OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), + output_image_shape)); + MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), + output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(pre_output->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(pre_cell->opencl_image())); + kernel_.setArg(idx++, forget_bias_); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(hidden_units)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); + kernel_.setArg(idx++, 
*(cell->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 1e45b2261edcdecbdbfe3b0c7a2a4dceed559306..006374f9d099df01f866231f1756c97ec4b16190 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -30,11 +30,10 @@ namespace ops { namespace opencl { namespace image { -template class LSTMCellKernel : public OpenCLLSTMCellKernel { public: explicit LSTMCellKernel( - const T forget_bias) + const float forget_bias) : forget_bias_(forget_bias) {} MaceStatus Compute( OpContext *context, @@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { Tensor *output) override; private: - T forget_bias_; + float forget_bias_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus LSTMCellKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output) { - MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, - "LSTM hidden units should be a multiple of 4"); - - const index_t height = input->dim(0); - const index_t width = input->dim(1); - const index_t hidden_units = pre_output->dim(1); - const index_t w_blocks = hidden_units >> 2; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); - built_options.emplace("-Dlstmcell=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[2] = {static_cast(w_blocks), - static_cast(height)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - std::vector output_shape_padded = {height, 1, 1, hidden_units}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape_padded, - OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), - output_image_shape)); - MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), - output_image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(pre_output->opencl_image())); - kernel_.setArg(idx++, *(weight->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(pre_cell->opencl_image())); - kernel_.setArg(idx++, static_cast(forget_bias_)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, 
static_cast(hidden_units)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); - kernel_.setArg(idx++, *(cell->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/matmul.cc b/mace/ops/opencl/image/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..a16d845d09bd5d6778ba100b1f8b93b43ff07ddf --- /dev/null +++ b/mace/ops/opencl/image/matmul.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/matmul.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus MatMulKernel::Compute( + OpContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b) { + MACE_CHECK(!transpose_a && !transpose_b, + "GPU does not support transpose matmul"); + + index_t rank = A->dim_size(); + index_t height = A->dim(rank - 2); + index_t K = A->dim(rank - 1); + index_t width = B->dim(rank - 1); + index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, + std::multiplies()); + + std::vector c_shape = A->shape(); + c_shape[rank - 2] = height; + c_shape[rank - 1] = width; + std::vector c_image_shape; + std::vector padded_c_shape = {batch, height, width, 1}; + OpenCLUtil::CalImage2DShape(padded_c_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &c_image_shape); + MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); + + const index_t height_blocks = RoundUpDiv4(height); + const index_t width_blocks = RoundUpDiv4(width); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * batch), + }; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(A->opencl_image())); + kernel_.setArg(idx++, *(B->opencl_image())); + kernel_.setArg(idx++, 
*(C->opencl_image())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(K)); + kernel_.setArg(idx++, static_cast(height_blocks)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); + + const std::vector lws = {kwg_size_ / 64, 64, 0}; + std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index 1cd5e7b1688e269b5b2c0cd1bbb0c17539572545..afd4792cba2eb3a33ccbf88959481ebb0cb3f225 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class MatMulKernel : public OpenCLMatMulKernel { public: MaceStatus Compute( @@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel { uint32_t kwg_size_; }; -template -MaceStatus MatMulKernel::Compute( - OpContext *context, - const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b) { - MACE_CHECK(!transpose_a && !transpose_b, - "GPU does not support transpose matmul"); - - index_t rank = A->dim_size(); - index_t height = A->dim(rank - 2); - index_t K = A->dim(rank - 1); - index_t width = B->dim(rank - 1); - index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, - std::multiplies()); - - std::vector c_shape = A->shape(); - c_shape[rank - 2] = height; - c_shape[rank - 1] = width; - std::vector c_image_shape; - std::vector padded_c_shape = {batch, height, width, 1}; - OpenCLUtil::CalImage2DShape(padded_c_shape, - OpenCLBufferType::IN_OUT_HEIGHT, - &c_image_shape); - MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); - - const index_t height_blocks = RoundUpDiv4(height); - const index_t width_blocks = RoundUpDiv4(width); - const uint32_t gws[2] = { - static_cast(width_blocks), - static_cast(height_blocks * batch), - }; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); - built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(A->opencl_image())); - kernel_.setArg(idx++, *(B->opencl_image())); - kernel_.setArg(idx++, *(C->opencl_image())); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(K)); - kernel_.setArg(idx++, static_cast(height_blocks)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); - - const std::vector lws = {kwg_size_ / 64, 64, 0}; - std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - 
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/pad.cc b/mace/ops/opencl/image/pad.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d057a69f120b8f86d00c771688375fdd7194f04 --- /dev/null +++ b/mace/ops/opencl/image/pad.cc @@ -0,0 +1,124 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/pad.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus PadKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK(this->paddings_.size() == + static_cast((input->dim_size() * 2))); + MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && + (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) + << "Mace only support height/width dimension now"; + for (int i = 2; i <= 5; ++i) { + MACE_CHECK(paddings_[i] >= 0); + } + auto input_shape = input->shape(); + if (type_ == PadType::REFLECT) { + MACE_CHECK(paddings_[2] < input_shape[1] && + paddings_[3] < input_shape[1] && + paddings_[4] < input_shape[2] && + paddings_[5] < input_shape[2]); + } else if (type_ == PadType::SYMMETRIC) { + MACE_CHECK(paddings_[2] <= input_shape[1] && + paddings_[3] <= input_shape[1] && + paddings_[4] <= input_shape[2] && + paddings_[5] <= input_shape[2]); + } else { + MACE_CHECK(type_ == PadType::CONSTANT); + } + std::vector output_shape = { + input_shape[0] + this->paddings_[0] + this->paddings_[1], + input_shape[1] + this->paddings_[2] + this->paddings_[3], + input_shape[2] + this->paddings_[4] + this->paddings_[5], + input_shape[3] + this->paddings_[6] + this->paddings_[7]}; + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); + built_options.emplace("-Dpad=" + kernel_name); + auto dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + + built_options.emplace(MakeString("-DPAD_TYPE=", type_)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const 
uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + if (type_ == PadType::CONSTANT) { + kernel_.setArg(idx++, this->constant_value_); + } + kernel_.setArg(idx++, static_cast(input_shape[1])); + kernel_.setArg(idx++, static_cast(input_shape[2])); + kernel_.setArg(idx++, static_cast(output_shape[1])); + kernel_.setArg(idx++, this->paddings_[2]); + kernel_.setArg(idx++, this->paddings_[4]); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index a8a5212312ebbe74c3dd15b45346841dbd3c5c9b..f4b8278bbbfc4f67e1e16622baac4517ac441fb6 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/pad.h" +#include "mace/ops/common/pad_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class PadKernel : public OpenCLPadKernel { public: PadKernel(const PadType type, @@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel { std::vector input_shape_; }; -template -MaceStatus PadKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK(this->paddings_.size() == - static_cast((input->dim_size() * 2))); - MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && - (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) - << "Mace only support height/width dimension now"; - for (int i = 2; i <= 5; ++i) { - MACE_CHECK(paddings_[i] >= 0); - } - auto input_shape = input->shape(); - if (type_ == PadType::REFLECT) { - MACE_CHECK(paddings_[2] < input_shape[1] && - paddings_[3] < input_shape[1] && - paddings_[4] < input_shape[2] && - paddings_[5] < input_shape[2]); - } else if (type_ == PadType::SYMMETRIC) { - MACE_CHECK(paddings_[2] <= input_shape[1] && - paddings_[3] <= input_shape[1] && - paddings_[4] <= input_shape[2] && - paddings_[5] <= input_shape[2]); - } else { - MACE_CHECK(type_ == PadType::CONSTANT); - } - std::vector output_shape = { - input_shape[0] + this->paddings_[0] + this->paddings_[1], - input_shape[1] + this->paddings_[2] + this->paddings_[3], - input_shape[2] + this->paddings_[4] + this->paddings_[5], - input_shape[3] + this->paddings_[6] + this->paddings_[7]}; - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = 
context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); - built_options.emplace("-Dpad=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - built_options.emplace(MakeString("-DPAD_TYPE=", type_)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - MACE_OUT_OF_RANGE_INIT(kernel_); - - if (!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - if (type_ == PadType::CONSTANT) { - kernel_.setArg(idx++, this->constant_value_); - } - kernel_.setArg(idx++, static_cast(input_shape[1])); - kernel_.setArg(idx++, static_cast(input_shape[2])); - kernel_.setArg(idx++, static_cast(output_shape[1])); - kernel_.setArg(idx++, this->paddings_[2]); - kernel_.setArg(idx++, this->paddings_[4]); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/pooling.cc b/mace/ops/opencl/image/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..8d48e4d8997f9348ccdcd9de057753b271815991 --- /dev/null +++ b/mace/ops/opencl/image/pooling.cc @@ -0,0 +1,127 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
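// Illustrative sketch (not part of the patch): how PadKernel::Compute above
// derives the NHWC output shape from the flat 8-element paddings vector (two
// entries per dimension: before/after for N, H, W, C). The OpenCL image
// kernel only pads H and W, so the N and C entries must be zero. Plain
// std::vector shapes are used here instead of MACE tensors.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> PaddedOutputShape(const std::vector<int64_t> &input_shape,
                                       const std::vector<int> &paddings) {
  assert(input_shape.size() == 4 && paddings.size() == 8);
  // Batch and channel padding are rejected by the GPU kernel.
  assert(paddings[0] == 0 && paddings[1] == 0 &&
         paddings[6] == 0 && paddings[7] == 0);
  std::vector<int64_t> out(4);
  for (int d = 0; d < 4; ++d) {
    out[d] = input_shape[d] + paddings[2 * d] + paddings[2 * d + 1];
  }
  return out;
}

int main() {
  // Pad a 1x16x16x8 image by 1 pixel on each side of H and W -> 1x18x18x8.
  auto out = PaddedOutputShape({1, 16, 16, 8}, {0, 0, 1, 1, 1, 1, 0, 0});
  for (auto d : out) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}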
+ +#include "mace/ops/opencl/image/pooling.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus PoolingKernel::Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const RoundType round_type, + Tensor *output) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, round_type, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + + if (pooling_type == MAX && input->dtype() == output->dtype()) { + auto data_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); + kernel_.setArg(idx++, strides[0]); + kernel_.setArg(idx++, strides[1]); + kernel_.setArg(idx++, kernels[0]); + kernel_.setArg(idx++, kernels[1]); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = pooling::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace 
ops +} // namespace mace diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 768a75caeb3f1fc00c32973f183cec7bf9c5979f..8d709368c8f9d2154cbd60eb07c1a9742fc2f506 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -57,7 +57,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace pooling -template class PoolingKernel : public OpenCLPoolingKernel { public: MaceStatus Compute( @@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel { std::vector input_shape_; }; -template -MaceStatus PoolingKernel::Compute( - OpContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const RoundType round_type, - Tensor *output) { - MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) - << "Pooling opencl kernel not support dilation yet"; - - std::vector output_shape(4); - std::vector filter_shape = {input->dim(3), input->dim(3), - kernels[0], kernels[1]}; - - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter_shape.data(), - padding_data.data(), dilations, strides, round_type, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - - if (pooling_type == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - } - if (pooling_type == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output->dim(3))), - static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)), - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, paddings[0] / 2); - kernel_.setArg(idx++, paddings[1] / 2); - kernel_.setArg(idx++, strides[0]); - kernel_.setArg(idx++, strides[1]); - kernel_.setArg(idx++, kernels[0]); - kernel_.setArg(idx++, kernels[1]); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = 
input->shape(); - } - - const std::vector lws = pooling::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/reduce.cc b/mace/ops/opencl/image/reduce.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee7e2ce1c0d99a9cab3e77c08826827b02805a0f --- /dev/null +++ b/mace/ops/opencl/image/reduce.cc @@ -0,0 +1,140 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/reduce.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ReduceKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK_NOTNULL(input); + index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channel_blocks = RoundUpDiv4(channels); + const uint32_t image_size = static_cast(in_height * in_width); + + std::vector gws(3); + std::vector lws(3); + std::vector output_shape{batch, 1, 1, channels}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce"); + built_options.emplace("-Dreduce=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_)); + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; + } else { + // Ensure each kernel has at least 4 input elements. 
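// Exposition only (this restates the scheduling in the code that follows, it
// adds no behavior): on non-Adreno GPUs the H*W plane of each
// (batch, channel-block) pair is reduced by a single work-group of
// gws[0] * gws[1] = 4 * clamp(image_size / 16, 1, 16) items, so each item
// covers roughly four or more pixels for typical sizes; group_num is that
// work-group size, compute_size = ceil(image_size / group_num) is the
// per-item workload, last_index marks where the remainder of the division
// falls, and scale = 1 / (H * W) converts the accumulated sum into a mean
// when the reduce type requires it.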
+ gws = {4, image_size / 16, static_cast(batch * channel_blocks)}; + if (gws[1] == 0) { + gws[1] = 1; + } else if (gws[1] > 16) { + gws[1] = 16; + } + } + lws = {gws[0], gws[1], 1}; + const int group_num = lws[0] * lws[1] * lws[2]; + // Each kernel intends to compute compute_size elements. + const int compute_size = (image_size + group_num - 1) / group_num; + const int last_index = image_size % group_num; + const float scale = 1.f / (in_width * in_height); + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), + nullptr); + kernel_.setArg(idx++, static_cast(group_num)); + kernel_.setArg(idx++, static_cast(compute_size)); + kernel_.setArg(idx++, static_cast(last_index)); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, scale); + kernel_.setArg(idx++, static_cast(channel_blocks)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index fa69a11621c5f395be237bed7867c356b576a844..992ac1b1491c1ccfeba27ad39b743ab568354797 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -24,20 +24,18 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/reduce.h" +#include "mace/ops/common/reduce_type.h" namespace mace { namespace ops { namespace opencl { namespace image { -template class ReduceKernel : public OpenCLReduceKernel { public: ReduceKernel(ReduceType type, - const std::vector &axis, - const bool keep_dims) - : reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {} + const std::vector &axis) + : reduce_type_(type), axis_(axis) {} MaceStatus Compute( OpContext *context, @@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel { private: ReduceType reduce_type_; const std::vector axis_; - bool keep_dims_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus ReduceKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK_NOTNULL(input); - index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - const index_t 
channel_blocks = RoundUpDiv4(channels); - const uint32_t image_size = static_cast(in_height * in_width); - - std::vector gws(3); - std::vector lws(3); - std::vector output_shape{batch, 1, 1, channels}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce"); - built_options.emplace("-Dreduce=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_)); - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; - } else { - // Ensure each kernel has at least 4 input elements. - gws = {4, image_size / 16, static_cast(batch * channel_blocks)}; - if (gws[1] == 0) { - gws[1] = 1; - } else if (gws[1] > 16) { - gws[1] = 16; - } - } - lws = {gws[0], gws[1], 1}; - const int group_num = lws[0] * lws[1] * lws[2]; - // Each kernel intends to compute compute_size elements. 
- const int compute_size = (image_size + group_num - 1) / group_num; - const int last_index = image_size % group_num; - const float scale = 1.f / (in_width * in_height); - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(group_num)); - kernel_.setArg(idx++, static_cast(compute_size)); - kernel_.setArg(idx++, static_cast(last_index)); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, scale); - kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_bicubic.cc b/mace/ops/opencl/image/resize_bicubic.cc new file mode 100644 index 0000000000000000000000000000000000000000..e09b5640d55c9a672a39146e4bbc3c683d21f06c --- /dev/null +++ b/mace/ops/opencl/image/resize_bicubic.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
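The resize_bicubic.cc added below (like the bilinear and nearest-neighbor kernels later in this diff) now builds with a fixed float OpenCL type and takes its sampling step from common::utils::CalculateResizeScale. A minimal sketch of the usual align_corners convention, assumed here to match that helper's behavior (the free function below is an illustration, not MACE's implementation):

#include <cstdint>

inline float ResizeScale(int64_t in_size, int64_t out_size, bool align_corners) {
  // With align_corners the first and last samples of input and output coincide,
  // so the step is (in - 1) / (out - 1); otherwise it is the plain size ratio.
  if (align_corners && out_size > 1) {
    return static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1);
  }
  return static_cast<float>(in_size) / static_cast<float>(out_size);
}

The -DTABLE_SIZE build option then fixes the resolution of the precomputed bicubic coefficient table used inside the OpenCL kernel (the header in this diff defines kTableSize as 1u << 10).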
+ +#include "mace/ops/opencl/image/resize_bicubic.h" + +#include "mace/ops/common/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeBicubicKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); + built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace( + MakeString("-DTABLE_SIZE=", common::utils::kTableSize)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bicubic", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale( + in_height, out_height, align_corners_); + float width_scale = + common::utils::CalculateResizeScale( + in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index 31957af017b7ab82413595ad22bec73454e13029..cb215f19aa6a22fb3f919b2048b85e084c35667e 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -25,13 +25,14 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_bicubic.h" namespace mace { namespace ops { namespace opencl { namespace image 
{ namespace resize_bicubic { +constexpr int64_t kTableSize = (1u << 10); + inline std::vector LocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size) { @@ -60,7 +61,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_bicubic -template class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { public: ResizeBicubicKernel(bool align_corners, @@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { std::vector input_shape_; }; -template -MaceStatus ResizeBicubicKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - auto dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); - built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace( - MakeString("-DTABLE_SIZE=", - mace::ops::resize_bicubic::kTableSize)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bicubic", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_bicubic::CalculateResizeScale( - in_height, out_height, align_corners_); - float width_scale = - mace::ops::resize_bicubic::CalculateResizeScale( - in_width, out_width, align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_bilinear.cc b/mace/ops/opencl/image/resize_bilinear.cc 
new file mode 100644 index 0000000000000000000000000000000000000000..91d82e821d2dc21da48990b22423962ee4decede --- /dev/null +++ b/mace/ops/opencl/image/resize_bilinear.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/resize_bilinear.h" + +#include "mace/ops/common/utils.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeBilinearKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); + built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bilinear", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + 
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index 5b778122d59cb3dd6a768ceeb202413647b49691..68b1478dc81d620cc2bde198b02c221913b7939f 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -25,7 +25,6 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_bilinear.h" namespace mace { namespace ops { @@ -65,12 +64,11 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_bilinear -template class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { public: ResizeBilinearKernel(bool align_corners, - const index_t out_height, - const index_t out_width) + const index_t out_height, + const index_t out_width) : align_corners_(align_corners), out_height_(out_height), out_width_(out_width) {} @@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { std::vector input_shape_; }; -template -MaceStatus ResizeBilinearKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); - built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bilinear", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); - float width_scale = - mace::ops::resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - 
kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.cc b/mace/ops/opencl/image/resize_nearest_neighbor.cc new file mode 100644 index 0000000000000000000000000000000000000000..afb4b151d4ed0ea6ad17030025bf82123adf5d3d --- /dev/null +++ b/mace/ops/opencl/image/resize_nearest_neighbor.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/resize_nearest_neighbor.h" + +#include "mace/ops/common/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeNearestNeighborKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *size, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + Tensor::MappingGuard size_mapper(size); + const index_t out_height = size->data()[0]; + const index_t out_width = size->data()[1]; + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL( + "resize_nearest_neighbor_nocache"); + built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_nearest_neighbor", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale( + in_height, out_height, 
align_corners_); + float width_scale = + common::utils::CalculateResizeScale( + in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + kernel_.setArg(idx++, static_cast(align_corners_)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.h b/mace/ops/opencl/image/resize_nearest_neighbor.h index 8f5bca6b029599e7a42899453279c4f77758196b..98ef37b28944521123996fbb38f6688d90a277c0 100644 --- a/mace/ops/opencl/image/resize_nearest_neighbor.h +++ b/mace/ops/opencl/image/resize_nearest_neighbor.h @@ -25,7 +25,6 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_nearest_neighbor.h" namespace mace { namespace ops { @@ -65,7 +64,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_nearest_neighbor -template class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { public: explicit ResizeNearestNeighborKernel(bool align_corners) @@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { std::vector input_shape_; }; -template -MaceStatus ResizeNearestNeighborKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *size, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - Tensor::MappingGuard size_mapper(size); - const index_t out_height = size->data()[0]; - const index_t out_width = size->data()[1]; - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL( - "resize_nearest_neighbor_nocache"); - built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_nearest_neighbor", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width 
> 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_nearest_neighbor::CalculateResizeScale( - in_height, out_height, align_corners_); - float width_scale = - mace::ops::resize_nearest_neighbor::CalculateResizeScale( - in_width, out_width, align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - kernel_.setArg(idx++, static_cast(align_corners_)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/softmax.cc b/mace/ops/opencl/image/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..f37b76d6f5ebec1fe9f4ab8b533848fca1dfd3be --- /dev/null +++ b/mace/ops/opencl/image/softmax.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
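The softmax kernel added below works, like the other image kernels in this diff, on an RGBA-packed image layout: four channels share one texel, so the grid is sized in channel blocks of four and the kernel is told how many lanes of the last block are padding (remain_channels). A standalone sketch of that arithmetic, reimplemented here for illustration only:

#include <cstdint>

inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

inline int64_t RemainChannels(int64_t channels) {
  // e.g. channels = 10 -> 3 blocks of four texel lanes, 2 of which are padding.
  return RoundUpDiv4(channels) * 4 - channels;
}

Passing remain_channels lets the kernel exclude the padded lanes when it takes the max and normalizes over the channel axis.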
+ +#include "mace/ops/opencl/image/softmax.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SoftmaxKernel::Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) { + index_t batch = 0; + index_t height = 0; + index_t width = 0; + index_t channels = 0; + + if (logits->dim_size() == 2) { + batch = logits->dim(0); + height = 1; + width = 1; + channels = logits->dim(1); + + } else if (logits->dim_size() == 4) { + batch = logits->dim(0); + height = logits->dim(1); + width = logits->dim(2); + channels = logits->dim(3); + } else { + MACE_NOT_IMPLEMENTED; + } + + const index_t channel_blocks = RoundUpDiv4(channels); + const int remain_channels = channel_blocks * 4 - channels; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (use_log_) + built_options.emplace("-DUSE_LOG"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, logits->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(logits->opencl_image())); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = logits->shape(); + } + + std::vector lws = softmax::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("softmax_opencl_kernel", batch, height, width, channels); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h index 3aa84bb5091066bff8565d3428fca7ebe4badafd..505dff57c9a7caf718a4a7f98ab3d6ffe58a5565 100644 --- a/mace/ops/opencl/image/softmax.h +++ b/mace/ops/opencl/image/softmax.h @@ -56,7 +56,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } } // namespace softmax -template class SoftmaxKernel : public OpenCLSoftmaxKernel { public: explicit SoftmaxKernel(bool use_log) @@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { std::vector input_shape_; }; -template -MaceStatus SoftmaxKernel::Compute( - OpContext *context, - const Tensor *logits, - Tensor *output) { - index_t batch = 0; - index_t height = 0; - index_t width = 0; - index_t channels = 0; - - if (logits->dim_size() == 2) { - batch = logits->dim(0); - height = 1; - width = 1; - channels = logits->dim(1); - - } else if (logits->dim_size() == 4) { - batch = logits->dim(0); - height = logits->dim(1); - width = logits->dim(2); - channels = logits->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - - const index_t channel_blocks = RoundUpDiv4(channels); - const int remain_channels = channel_blocks * 
4 - channels; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (use_log_) - built_options.emplace("-DUSE_LOG"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, logits->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(logits->opencl_image())); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, remain_channels); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = logits->shape(); - } - - std::vector lws = softmax::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/space_to_batch.cc b/mace/ops/opencl/image/space_to_batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..771d8e32ec7fd2ac9e887ad59b94c04aefde8b8f --- /dev/null +++ b/mace/ops/opencl/image/space_to_batch.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
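The space_to_batch kernel added below receives its output_shape from the op and only sets up the OpenCL launch; the shape relation it relies on is the usual one for this transform. A rough standalone sketch, assuming NHWC layout and a paddings vector ordered {top, bottom, left, right} (the function name below is hypothetical, not MACE's API):

#include <cstdint>
#include <vector>

inline std::vector<int64_t> SpaceToBatchShape(const std::vector<int64_t> &in,  // {N, H, W, C}
                                              const std::vector<int> &block,   // {block_h, block_w}
                                              const std::vector<int> &pad) {   // {top, bottom, left, right}
  const int64_t padded_h = in[1] + pad[0] + pad[1];
  const int64_t padded_w = in[2] + pad[2] + pad[3];
  // Every block_h x block_w tile of the padded plane becomes its own batch entry.
  return {in[0] * block[0] * block[1],
          padded_h / block[0],
          padded_w / block[1],
          in[3]};
}

The kernel itself only needs block_shape, the top/left padding (paddings[0] and paddings[2]) and the input/output spatial extents, which matches the argument list set in the code below.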
+ +#include "mace/ops/opencl/image/space_to_batch.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SpaceToBatchKernel::Compute( + OpContext *context, + const Tensor *space_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *batch_tensor) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + batch_tensor->ResizeImage(output_shape, output_image_shape)); + const char *kernel_name = "space_to_batch"; + const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto input_dt = space_tensor->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, space_tensor->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape[0]); + kernel_.setArg(idx++, block_shape[1]); + kernel_.setArg(idx++, paddings[0]); + kernel_.setArg(idx++, paddings[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + input_shape_ = space_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 28f00df5fc7e549f6e58dd327de9544b68598fb1..6ad5d22833e2ff2104c974bd77f6da5c76af1ad3 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { public: MaceStatus Compute( @@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { std::vector input_shape_; }; -template -MaceStatus SpaceToBatchKernel::Compute( - 
OpContext *context, - const Tensor *space_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *batch_tensor) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - batch_tensor->ResizeImage(output_shape, output_image_shape)); - const char *kernel_name = "space_to_batch"; - const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, space_tensor->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape[0]); - kernel_.setArg(idx++, block_shape[1]); - kernel_.setArg(idx++, paddings[0]); - kernel_.setArg(idx++, paddings[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - input_shape_ = space_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/space_to_depth.cc b/mace/ops/opencl/image/space_to_depth.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b48769b7afecee3dcf73f888653fd1cede42cc4 --- /dev/null +++ b/mace/ops/opencl/image/space_to_depth.cc @@ -0,0 +1,111 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/space_to_depth.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SpaceToDepthKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_depth = input->dim(3); + + MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0, + "input channel should be dividable by 4"); + MACE_CHECK( + (input_width % block_size_ == 0) && (input_height % block_size_ == 0), + "input width and height should be dividable by block_size"); + + const index_t output_height = input_height / block_size_; + const index_t output_width = input_width / block_size_; + const index_t output_depth = input_depth * block_size_ * block_size_; + + const index_t output_depth_blocks = RoundUpDiv4(output_depth); + + std::vector output_shape = {batch, output_height, output_width, + output_depth}; + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + const char *kernel_name = "space_to_depth"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + if (input_depth < 4) { + built_options.emplace(MakeString("-DDEPTH", input_depth)); + } + built_options.emplace(kernel_name_ss.str()); + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", + obfuscated_kernel_name, + built_options, + &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(output_depth_blocks), + static_cast(output_width), + static_cast(output_height * batch)}; + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(input_height)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth)); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(output_height)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("space_to_depth", input->dim(0), + input->dim(1), input->dim(2), input->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/space_to_depth.h 
b/mace/ops/opencl/image/space_to_depth.h index e58b7b8d0660cc6c91d965557a17cb1c206f072e..324977ea45c518a4a7a46520f0b5626c82716ea2 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { public: explicit SpaceToDepthKernel(const int block_size) @@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { std::vector input_shape_; }; -template -MaceStatus SpaceToDepthKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0, - "input channel should be dividable by 4"); - MACE_CHECK( - (input_width % block_size_ == 0) && (input_height % block_size_ == 0), - "input width and height should be dividable by block_size"); - - const index_t output_height = input_height / block_size_; - const index_t output_width = input_width / block_size_; - const index_t output_depth = input_depth * block_size_ * block_size_; - - const index_t output_depth_blocks = RoundUpDiv4(output_depth); - - std::vector output_shape = {batch, output_height, output_width, - output_depth}; - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - const char *kernel_name = "space_to_depth"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - if (input_depth < 4) { - built_options.emplace(MakeString("-DDEPTH", input_depth)); - } - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(output_depth_blocks), - static_cast(output_width), - static_cast(output_height * batch)}; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input_height)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth)); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(output_height)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("space_to_depth", input->dim(0), - input->dim(1), input->dim(2), input->dim(3)); - 
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/split.cc b/mace/ops/opencl/image/split.cc new file mode 100644 index 0000000000000000000000000000000000000000..1df73c47e7339ba6e2a174d5271b03ca3f07056b --- /dev/null +++ b/mace/ops/opencl/image/split.cc @@ -0,0 +1,123 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/split.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SplitKernel::Compute( + OpContext *context, + const Tensor *input, + const std::vector &output_list) { + MACE_UNUSED(axis_); + const index_t input_channels = input->dim(3); + const size_t outputs_count = output_list.size(); + const index_t output_channels = input_channels / outputs_count; + std::vector output_shape( + {input->dim(0), input->dim(1), input->dim(2), output_channels}); + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + for (size_t i = 0; i < outputs_count; ++i) { + MACE_RETURN_IF_ERROR( + output_list[i]->ResizeImage(output_shape, image_shape)); + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); + built_options.emplace("-Dsplit=" + kernel_name); + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("split", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + const index_t channel_blk = RoundUpDiv4(output_channels); + + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(input->dim(2)), + static_cast(input->dim(0) * input->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + cl::Event event; + CallStats call_stats{INT64_MAX, 0}; + for (size_t i = 0; i < outputs_count; ++i) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(channel_blk * i)); + kernel_.setArg(idx++, *(output_list[i]->opencl_image())); + + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for 
(size_t j = 0; j < 3; ++j) { + roundup_gws[j] = RoundUp(gws[j], lws[j]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr && runtime->is_profiling_enabled()) { + event.wait(); + CallStats tmp_stats; + runtime->GetCallStats(event, &tmp_stats); + call_stats.start_micros = + std::min(tmp_stats.start_micros, call_stats.start_micros); + call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + if (context->future() != nullptr) { + context->future()->wait_fn = [call_stats](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = call_stats.start_micros; + stats->end_micros = stats->start_micros + call_stats.end_micros; + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 12755910a75cd812725b02dd76d35c052a6f6826..956ff6573a60ed2050d5b526f58734cdc8fdff43 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class SplitKernel : public OpenCLSplitKernel { public: explicit SplitKernel(const int32_t axis) : axis_(axis) {} @@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel { uint32_t kwg_size_; }; -template -MaceStatus SplitKernel::Compute( - OpContext *context, - const Tensor *input, - const std::vector &output_list) { - const index_t input_channels = input->dim(3); - const size_t outputs_count = output_list.size(); - const index_t output_channels = input_channels / outputs_count; - std::vector output_shape( - {input->dim(0), input->dim(1), input->dim(2), output_channels}); - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - for (size_t i = 0; i < outputs_count; ++i) { - MACE_RETURN_IF_ERROR( - output_list[i]->ResizeImage(output_shape, image_shape)); - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); - built_options.emplace("-Dsplit=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("split", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - const index_t channel_blk = RoundUpDiv4(output_channels); - - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(input->dim(2)), - static_cast(input->dim(0) * input->dim(1)), - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - cl::Event event; - CallStats call_stats{INT64_MAX, 0}; - for (size_t i = 0; i < outputs_count; ++i) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(channel_blk * i)); - kernel_.setArg(idx++, 
*(output_list[i]->opencl_image())); - - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t j = 0; j < 3; ++j) { - roundup_gws[j] = RoundUp(gws[j], lws[j]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr && runtime->is_profiling_enabled()) { - event.wait(); - CallStats tmp_stats; - runtime->GetCallStats(event, &tmp_stats); - call_stats.start_micros = - std::min(tmp_stats.start_micros, call_stats.start_micros); - call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; - } - } - if (context->future() != nullptr) { - context->future()->wait_fn = [call_stats](CallStats *stats) { - if (stats != nullptr) { - stats->start_micros = call_stats.start_micros; - stats->end_micros = stats->start_micros + call_stats.end_micros; - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/sqrdiff_mean.cc b/mace/ops/opencl/image/sqrdiff_mean.cc new file mode 100644 index 0000000000000000000000000000000000000000..442a319159f40349c84b6807ad25da529527ca78 --- /dev/null +++ b/mace/ops/opencl/image/sqrdiff_mean.cc @@ -0,0 +1,140 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "mace/ops/opencl/image/sqrdiff_mean.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+
+MaceStatus SqrDiffMeanKernel::Compute(
+    OpContext *context,
+    const Tensor *input0,
+    const Tensor *input1,
+    Tensor *output) {
+  MACE_CHECK_NOTNULL(input0);
+  MACE_CHECK_NOTNULL(input1);
+  MACE_CHECK(input0->dim(0) == input1->dim(0) &&
+             input0->dim(3) == input1->dim(3));
+  MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
+             "SqrDiffMean gpu only support 4-dim input");
+  index_t batch = input0->dim(0);
+  const index_t in_height = input0->dim(1);
+  const index_t in_width = input0->dim(2);
+  const index_t channels = input0->dim(3);
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
+
+  std::vector<uint32_t> gws(3);
+  std::vector<uint32_t> lws(3);
+  std::vector<index_t> output_shape{batch, 1, 1, channels};
+  std::vector<size_t> output_image_shape;
+  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
+                              &output_image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
+
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
+    built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
+    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_QUALCOMM_ADRENO");
+    }
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
+    const uint32_t wave_size =
+        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
+    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
+  } else {
+    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
+  }
+  lws = {gws[0], gws[1], 1};
+  const int group_size = lws[0] * lws[1] * lws[2];
+  const int partial_len = (image_size + group_size - 1) / group_size;
+  const int remain_index = image_size % group_size;
+  const float img_size_reciprocal = 1.f / (in_width * in_height);
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input0->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input0->opencl_image()));
+    kernel_.setArg(idx++, *(input1->opencl_image()));
+    kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
+                   nullptr);
+    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
+    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
+    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
+    kernel_.setArg(idx++, img_size_reciprocal);
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input0->shape();
+  }
+
+  cl::Event event;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index bde87a80896e0f76d4dbdf82acb130c8d9e80460..bd2d1e7f394e7ed98eb8bd4e948da32e615349be 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { public: MaceStatus Compute( @@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { std::vector input_shape_; }; -template -MaceStatus SqrDiffMeanKernel::Compute( - OpContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output) { - MACE_CHECK_NOTNULL(input0); - MACE_CHECK_NOTNULL(input1); - MACE_CHECK(input0->dim(0) == input1->dim(0) && - input0->dim(3) == input1->dim(3)); - MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4, - "SqrDiffMean gpu only support 4-dim input"); - index_t batch = input0->dim(0); - const index_t in_height = input0->dim(1); - const index_t in_width = input0->dim(2); - const index_t channels = input0->dim(3); - const index_t channel_blocks = RoundUpDiv4(channels); - const uint32_t image_size = static_cast(in_height * in_width); - - std::vector gws(3); - std::vector lws(3); - std::vector output_shape{batch, 1, 1, channels}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean"); - built_options.emplace("-Dsqrdiff_mean=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; - } else { - gws = {4, 16, static_cast(batch * channel_blocks)}; - } - lws = {gws[0], gws[1], 1}; - const int group_size = lws[0] * lws[1] * lws[2]; - const int partial_len = (image_size + group_size - 1) / group_size; - const int 
remain_index = image_size % group_size; - const float img_size_reciprocal = 1.f / (in_width * in_height); - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - kernel_.setArg(idx++, *(input1->opencl_image())); - kernel_.setArg(idx++, (group_size * 4 * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(group_size)); - kernel_.setArg(idx++, static_cast(partial_len)); - kernel_.setArg(idx++, static_cast(remain_index)); - kernel_.setArg(idx++, static_cast(batch)); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, img_size_reciprocal); - kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 40b83fa62e757b1f13a1e06c6f91b6db1e29ab1b..1ea2634a022e7614bcc600e3e34827e7a4aa8338 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -29,7 +29,6 @@ namespace { MaceStatus WinogradInputTransform(OpContext *context, cl::Kernel *kernel, const Tensor *input_tensor, - const DataType dt, const int *paddings, const index_t round_h, const index_t round_w, @@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context, MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); return MaceStatus::MACE_SUCCESS; } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options, @@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context, kernel->setArg(idx++, static_cast(paddings[1] / 2)); } - const std::vector lws = {*kwg_size / 8, 8, 0}; std::string tuning_key = Concat("winograd_transform_kernel", output_tensor->dim(0), @@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context, cl::Kernel *kernel, const Tensor *input_tensor, const Tensor *bias, - const DataType dt, const index_t round_h, const index_t round_w, const int wino_blk_size, @@ -145,32 +142,40 @@ MaceStatus WinogradOutputTransform(OpContext 
*context, return MaceStatus::MACE_SUCCESS; } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case PRELU: + } + case PRELU: { built_options.emplace("-DUSE_PRELU"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", @@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, const int wino_blk_size, std::vector *prev_input_shape, Tensor *output, @@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, OpenCLBufferType::IN_OUT_HEIGHT, &t_input_image_shape); ScratchImage transformed_input_image(scratch_manager); - std::unique_ptr transformed_input = make_unique( - transformed_input_image.Scratch(context->device()->allocator(), - t_input_image_shape, dt), dt); + auto input_dt = input->dtype(); + auto image = transformed_input_image.Scratch(context->device()->allocator(), + t_input_image_shape, input_dt); + auto transformed_input = make_unique(image, input_dt); MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, t_input_image_shape)); MACE_RETURN_IF_ERROR(WinogradInputTransform( - context, kernels[0], input, dt, paddings, + context, kernels[0], input, paddings, round_h, round_w, wino_blk_size, input_changed, transformed_input.get(), kwg_size[0], &t_input_future)); @@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, &mm_output_image_shape); ScratchImage mm_output_image(scratch_manager); + auto output_dt = input->dtype(); std::unique_ptr mm_output = make_unique( mm_output_image.Scratch(context->device()->allocator(), - mm_output_image_shape, dt), dt); + mm_output_image_shape, output_dt), output_dt); MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, mm_output_image_shape)); @@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, built_options, kernels[1])); @@ -334,7 +340,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const std::vector lws = {*kwg_size[1] / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", mm_output_shape[0], - mm_output_shape[1], mm_output_shape[2]); + mm_output_shape[1], mm_output_shape[2]); 
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernels[1], tuning_key, gws, lws, &mm_future)); @@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, // t_output (blk_sqr, out_chan, out_width) -> output(NHWC) MACE_RETURN_IF_ERROR(WinogradOutputTransform( context, kernels[2], mm_output.get(), bias, - dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, + round_h, round_w, wino_blk_size, activation, relux_max_limit, leakyrelu_coefficient, input_changed, output, kwg_size[2], &t_output_future)) diff --git a/mace/ops/opencl/lstm_cell.cc b/mace/ops/opencl/lstm_cell.cc index 563a53bcbd8ebedf5d694ecfd5d9a4252fd735ad..ce45c84401f89d42762c8a2c2bccbb57c35c08e1 100644 --- a/mace/ops/opencl/lstm_cell.cc +++ b/mace/ops/opencl/lstm_cell.cc @@ -25,21 +25,20 @@ namespace mace { namespace ops { -template +template class LSTMCellOp; #ifdef MACE_ENABLE_OPENCL -template -class LSTMCellOp : public Operation { +template<> +class LSTMCellOp : public Operation { public: explicit LSTMCellOp(OpConstructContext *context) : Operation(context) { - T forget_bias = static_cast( - Operation::GetOptionalArg("scalar_input", - 0.0)); + float forget_bias = Operation::GetOptionalArg("scalar_input", + 0.0); MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(forget_bias); + kernel_ = make_unique(forget_bias); } else { MACE_NOT_IMPLEMENTED; } @@ -47,30 +46,26 @@ class LSTMCellOp : public Operation { const Tensor *pre_output = context->workspace()->GetTensor( operator_def_->input(1)); if (pre_output->is_weight()) { - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 1, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); + auto status = TransformFilter(context, operator_def_.get(), + 1, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); } - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 2, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 3, - OpenCLBufferType::ARGUMENT, - mem_type) == MaceStatus::MACE_SUCCESS); - const Tensor *pre_cell = context->workspace()->GetTensor( - operator_def_->input(4)); + auto status = TransformFilter(context, operator_def_.get(), + 2, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = TransformFilter(context, operator_def_.get(), + 3, OpenCLBufferType::ARGUMENT, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + const Tensor *pre_cell = + context->workspace()->GetTensor(operator_def_->input(4)); if (pre_cell->is_weight()) { - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 4, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); + status = TransformFilter(context, operator_def_.get(), + 4, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); } } @@ -92,14 +87,10 @@ class LSTMCellOp : public Operation { MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; -#endif +#endif // MACE_ENABLE_OPENCL void RegisterLSTMCell(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, - DeviceType::GPU, half); + MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp); } } // namespace ops diff --git 
a/mace/ops/opencl/pooling.h b/mace/ops/opencl/pooling.h index 78628593f98209b7ab2ec3898e24bf370f573268..9d652cdcf05e76da2db2bb5ade66523b7d9e1ab1 100644 --- a/mace/ops/opencl/pooling.h +++ b/mace/ops/opencl/pooling.h @@ -17,7 +17,7 @@ #include -#include "mace/ops/pooling.h" +#include "mace/ops/common/pooling_type.h" #include "mace/ops/common/conv_pool_2d_util.h" namespace mace { diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 24130d7ae381222fb6219b4d335afc4a9e0c5723..49784c10db2c999b07faffe927aa6d6ebb061746 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -16,7 +16,7 @@ #include #include "mace/core/operator.h" -#include "mace/ops/pad.h" +#include "mace/ops/common/pad_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pad.h" #endif // MACE_ENABLE_OPENCL @@ -26,10 +26,10 @@ namespace mace { namespace ops { -template +template class PadOp; -template +template class PadOp : public Operation { public: explicit PadOp(OpConstructContext *context) @@ -88,12 +88,12 @@ class PadOp : public Operation { for (index_t c = 0; c < channel; ++c) { for (index_t h = 0; h < height; ++h) { const index_t in_offset = (((b * channel + c) * height) + - h) * width; + h) * width; const index_t out_offset = - (((b + this->paddings_[0]) * output->dim(1) - + (c + this->paddings_[2])) * output->dim(2) - + (h + this->paddings_[4])) * output->dim(3) - + this->paddings_[6]; + (((b + this->paddings_[0]) * output->dim(1) + + (c + this->paddings_[2])) * output->dim(2) + + (h + this->paddings_[4])) * output->dim(3) + + this->paddings_[6]; memcpy(output_ptr + out_offset, input_ptr + in_offset, width * sizeof(T)); @@ -101,11 +101,11 @@ class PadOp : public Operation { } } } else if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) { - const index_t o_batch = output->dim(0); + const index_t o_batch = output->dim(0); const index_t o_channel = output->dim(1); - const index_t o_height = output->dim(2); - const index_t o_width = output->dim(3); - const int l_add = type_ == PadType::REFLECT ? 0 : -1; + const index_t o_height = output->dim(2); + const index_t o_width = output->dim(3); + const int l_add = type_ == PadType::REFLECT ? 0 : -1; const int r_add = type_ == PadType::REFLECT ? 
-2 : -1; for (index_t h = 0; h < o_height; ++h) { @@ -116,10 +116,10 @@ class PadOp : public Operation { for (index_t c = 0; c < o_channel; ++c) { index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add); - const index_t in_offset = (((b_in * channel + c_in) * height) + - h_in) * width; - index_t out_offset = (((b * o_channel + c) * o_height) + - h) * o_width; + const index_t in_offset = + (((b_in * channel + c_in) * height) + h_in) * width; + index_t out_offset = + (((b * o_channel + c) * o_height) + h) * o_width; for (index_t i = 0, j = paddings_[6] + l_add; i < paddings_[6]; ++i, --j) { @@ -169,8 +169,8 @@ class PadOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class PadOp : public Operation { +template<> +class PadOp : public Operation { public: explicit PadOp(OpConstructContext *context) : Operation(context) { @@ -180,7 +180,7 @@ class PadOp : public Operation { float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( type, paddings, constant_value); } else { MACE_NOT_IMPLEMENTED; @@ -198,18 +198,11 @@ class PadOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterPad(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Pad", PadOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Pad", PadOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp); } } // namespace ops diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index ef72ca9335657e09a50cff1ae523b2d4708f647f..4d4247f2b7236a0a3270c7d30a413c2885ca8256 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -16,8 +16,6 @@ #include #endif -#include "mace/ops/pooling.h" - #include #include #include @@ -28,6 +26,7 @@ #include "mace/core/tensor.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/common/pooling_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pooling.h" #include "mace/ops/opencl/buffer/pooling.h" @@ -486,15 +485,15 @@ class PoolingOp : public PoolingOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class PoolingOp : public PoolingOpBase { +template<> +class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { - kernel_ = make_unique>(); + kernel_ = make_unique(); } } MaceStatus Run(OpContext *context) override { @@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Pooling", PoolingOp); } } // namespace ops diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 27b34a91a32c214f22074e2f8605fdb29dd0d6f7..28083312872d269d49b9b509525aa5ee6021b6b0 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/reduce.h" - #include #include #include #include +#include "mace/ops/common/reduce_type.h" #include "mace/core/future.h" #include "mace/core/operator.h" #include "mace/core/runtime/cpu/cpu_runtime.h" @@ -868,15 +867,14 @@ void ReduceOp::Reduce4Dims( #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ReduceOp : public ReduceOpBase { +template<> +class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(reduce_type_, - axis_, - keep_dims_); + kernel_ = make_unique(reduce_type_, + axis_); } else { MACE_NOT_IMPLEMENTED; } @@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Reduce", ReduceOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Reduce") @@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } bool keep_dims = ProtoArgHelper::GetOptionalArg( *op, "keepdims", false); if (!keep_dims) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } auto axis = ProtoArgHelper::GetRepeatedArgs( *op, "axis"); if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } auto tensor_shape_info = context->tensor_shape_info(); if (tensor_shape_info->count(op->input(0)) == 0 || tensor_shape_info->at(op->input(0)).size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 349f6423470b4db78df0f65e24b1dc1ae00bef58..5e48ad392e9c46269187b632f5d19c1c058ef081 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/resize_bicubic.h" - #include #include #include #include #include "mace/core/operator.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bicubic.h" #endif // MACE_ENABLE_OPENCL @@ -33,12 +32,12 @@ inline const std::shared_ptr InitCoeffsTable() { // convolution algorithm. 
// https://en.wikipedia.org/wiki/Bicubic_interpolation auto coeffs_tab = std::shared_ptr( - new float[(resize_bicubic::kTableSize + 1) * 2], + new float[(common::utils::kTableSize + 1) * 2], std::default_delete()); float *coeffs_tab_ptr = coeffs_tab.get(); static const float A = -0.75f; - for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { - float x = i * 1.0f / resize_bicubic::kTableSize; + for (int i = 0; i <= common::utils::kTableSize; ++i) { + float x = i * 1.0f / common::utils::kTableSize; coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; x += 1.0; coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; @@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, std::vector *indices) { auto in_loc = static_cast(scale * out_loc); const float delta = scale * out_loc - in_loc; - const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); + const int64_t offset = lrintf(delta * common::utils::kTableSize); const float *coeffs_tab = GetCoeffsTable(); *weights = {coeffs_tab[offset * 2 + 1], coeffs_tab[offset * 2], - coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], - coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; + coeffs_tab[(common::utils::kTableSize - offset) * 2], + coeffs_tab[(common::utils::kTableSize - offset) * 2 + 1]}; *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; } @@ -173,13 +172,13 @@ class ResizeBicubicOp : public Operation { } float height_scale = - resize_bicubic::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bicubic::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); ResizeImage(context, input_data, @@ -202,8 +201,8 @@ class ResizeBicubicOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ResizeBicubicOp : public Operation { +template<> +class ResizeBicubicOp : public Operation { public: explicit ResizeBicubicOp(OpConstructContext *context) : Operation(context) { @@ -213,7 +212,7 @@ class ResizeBicubicOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; @@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeBicubic", ResizeBicubicOp); } } // namespace ops diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 09df62d880cad6a1f9ece73e5312a2b56df46340..e209864f15f1d18da6e6f96353f68e257252812e 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/resize_bilinear.h" - #include #include #include @@ -21,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/utils/memory.h" #include "mace/core/quantize.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" #endif // MACE_ENABLE_OPENCL @@ -223,13 +222,13 @@ class ResizeBilinearOp : public Operation { } float height_scale = - resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); std::vector ys(out_height + 1); std::vector xs(out_width + 1); @@ -299,13 +298,13 @@ class ResizeBilinearOp : public Operation { } float height_scale = - resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); std::vector ys(out_height + 1); std::vector xs(out_width + 1); @@ -336,8 +335,8 @@ class ResizeBilinearOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ResizeBilinearOp : public Operation { +template<> +class ResizeBilinearOp : public Operation { public: explicit ResizeBilinearOp(OpConstructContext *context) : Operation(context) { @@ -347,7 +346,7 @@ class ResizeBilinearOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; @@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeBilinear", ResizeBilinearOp); } } // namespace ops diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 9e98e75e16313fc7d3093260feaa0207d40bcbd0..89ed473c44e43c5dd4c6415fe2badfd9f738c844 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/resize_nearest_neighbor.h" - #include #include #include #include "mace/core/operator.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_nearest_neighbor.h" #endif // MACE_ENABLE_OPENCL @@ -115,13 +114,13 @@ class ResizeNearestNeighborOp : public Operation { } float height_scale = - resize_nearest_neighbor::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_nearest_neighbor::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); ResizeImageNCHW(context, input_data, batch, @@ -142,15 +141,15 @@ class ResizeNearestNeighborOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ResizeNearestNeighborOp : public Operation { +template<> +class ResizeNearestNeighborOp : public Operation { public: explicit ResizeNearestNeighborOp(OpConstructContext *context) : Operation(context) { bool align_corners = Operation::GetOptionalArg( "align_corners", false); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners); } else { MACE_NOT_IMPLEMENTED; @@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", - ResizeNearestNeighborOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", - ResizeNearestNeighborOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor", + ResizeNearestNeighborOp); } } // namespace ops diff --git a/mace/ops/resize_nearest_neighbor.h b/mace/ops/resize_nearest_neighbor.h deleted file mode 100644 index 0f27a219daf17329328321bd9132fad6ab5b462c..0000000000000000000000000000000000000000 --- a/mace/ops/resize_nearest_neighbor.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ -#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ - -#include "mace/core/types.h" - -namespace mace { -namespace ops { -namespace resize_nearest_neighbor { -inline float CalculateResizeScale(index_t in_size, - index_t out_size, - bool align_corners) { - return (align_corners && out_size > 1) - ? 
(in_size - 1) / static_cast(out_size - 1) - : in_size / static_cast(out_size); -} -} // namespace resize_nearest_neighbor -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index e32410989fe8c14cf936330769fd700eb0fe31b5..82a684b1a4056dcfa13be8b4c45aeb63e59781f2 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -35,10 +35,10 @@ namespace mace { namespace ops { -template +template class SoftmaxOp; -template <> +template<> class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) @@ -139,12 +139,12 @@ class SoftmaxOp : public Operation { sum = std::max(sum, std::numeric_limits::min()); if (use_log_) { for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + output_ptr[c] /= sum; output_ptr[c] = std::log(output_ptr[c]); } } else { for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + output_ptr[c] /= sum; } } } @@ -407,17 +407,17 @@ class SoftmaxOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class SoftmaxOp : public Operation { +template<> +class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) : Operation(context) { bool use_log = ( Operation::GetOptionalArg("use_log", false)); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(use_log); + kernel_ = make_unique(use_log); } else { - kernel_ = make_unique>(use_log); + kernel_ = make_unique(use_log); } } MaceStatus Run(OpContext *context) override { @@ -433,7 +433,6 @@ class SoftmaxOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSoftmax(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU, float); @@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Softmax", SoftmaxOp); MACE_REGISTER_OP_CONDITION( op_registry, @@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } if (op->output_shape(0).dims_size() != 2 && op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 50de3fc74b1104ccac8576e29a90911789dc91fd..156c2132289a487cb0db14d0bce05da85a31442d 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation { } }; -template +template class SpaceToBatchNDOp; -template <> +template<> class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) @@ -302,13 +302,13 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class SpaceToBatchNDOp : public SpaceToBatchOpBase { +template<> +class SpaceToBatchNDOp : public 
SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { SpaceToBatchNDOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SpaceToBatchND", - SpaceToBatchNDOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SpaceToBatchND", - SpaceToBatchNDOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp); } } // namespace ops diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 9584ddb8d7d43f3cea7c5b0612e7bca24346070d..d9b5473629da962985261bc955dc591ef4b3a0f7 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -24,7 +24,7 @@ namespace mace { namespace ops { -template +template class SpaceToDepthOp : public Operation { public: explicit SpaceToDepthOp(OpConstructContext *context) @@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class SpaceToDepthOp : public Operation { +template<> +class SpaceToDepthOp : public Operation { public: explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(block_size); + kernel_ = make_unique(block_size); } else { MACE_NOT_IMPLEMENTED; } @@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SpaceToDepth", - SpaceToDepthOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SpaceToDepth", - SpaceToDepthOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp); } } // namespace ops diff --git a/mace/ops/split.cc b/mace/ops/split.cc index b08d72c533d480a65cbff0c6fefb6a3b940322d6..ffe7172f841bb76be8e4428cdf9a30ac29ee27bd 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -100,14 +100,14 @@ class SplitOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class SplitOp : public Operation { +template<> +class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(axis); + kernel_ = make_unique(axis); } else { MACE_NOT_IMPLEMENTED; } @@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Split", SplitOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Split", SplitOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index cd2fb1742f4a31992922deb357f4cfa788c032f8..2d85ed98448ba37e60572df7f87c6184ebbeddfb 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -24,7 +24,7 @@ namespace mace { namespace 
ops { -template +template class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) @@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation { } }; - #ifdef MACE_ENABLE_OPENCL -template -class SqrDiffMeanOp : public Operation { +template<> +class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -101,18 +100,11 @@ class SqrDiffMeanOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSqrDiffMean(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp); } } // namespace ops diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 660a8e8f3dbfd8b54e701b5ff7714dc0c942aa3f..0c08cfd589b6d5c5f080432bffb62162706f15bc 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -20,18 +20,21 @@ namespace mace { namespace ops { -template -class SqueezeOp : public Operation { +class SqueezeOpRaw : public Operation { public: - explicit SqueezeOp(OpConstructContext *context) + explicit SqueezeOpRaw(OpConstructContext *context, + DeviceType device_type, + DataType data_type) : Operation(context), axis_(Operation::GetRepeatedArgs("axis", {})), - checked_(false) {} + checked_(false), + data_type_(data_type), + device_type_(device_type) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - if (!checked_ && D == DeviceType::CPU - && DataTypeToEnum::value != DT_UINT8) { + if (!checked_ && device_type_ == DeviceType::CPU + && data_type_ != DT_UINT8) { auto has_df = Operation::GetOptionalArg( "has_data_format", 0); if (has_df && this->Input(0)->dim_size() == 4) { @@ -62,6 +65,16 @@ class SqueezeOp : public Operation { private: std::vector axis_; bool checked_; + DataType data_type_; + DeviceType device_type_; +}; + +template +class SqueezeOp : public SqueezeOpRaw { + public: + explicit SqueezeOp(OpConstructContext *context) + : SqueezeOpRaw(context, D, DataTypeToEnum::value) { + } }; void RegisterSqueeze(OpRegistryBase *op_registry) { @@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Squeeze", SqueezeOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Squeeze") @@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } if (op->output_shape(0).dims_size() != 2 && op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; 
+ return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/python/tools/encrypt_opencl_codegen.py b/mace/python/tools/encrypt_opencl_codegen.py index 6fa3db4589bd883fe00433456808ef3b3c50c27e..2ef43a2d9ea2a9938e89250be2591079e9b8e5a4 100644 --- a/mace/python/tools/encrypt_opencl_codegen.py +++ b/mace/python/tools/encrypt_opencl_codegen.py @@ -37,55 +37,73 @@ def encrypt_code(code_str): return encrypted_arr +def create_output_dir(dir_path): + if os.path.exists(dir_path): + if os.path.isdir(dir_path): + try: + shutil.rmtree(dir_path) + except OSError: + raise RuntimeError( + "Cannot delete directory %s due to permission " + "error, inspect and remove manually" % dir_path) + else: + raise RuntimeError( + "Cannot delete non-directory %s, inspect ", + "and remove manually" % dir_path) + os.makedirs(dir_path) + + +def write_cl_encrypted_kernel_to_file( + encrypted_code_maps, template_path, output_path): + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + cl_encrypted_kernel = env.get_template(template_path).render( + tag='codegen', + maps=encrypted_code_maps, + data_type='unsigned char', + variable_name='kEncryptedProgramMap') + with open(output_path, "w") as w_file: + w_file.write(cl_encrypted_kernel) + + +def get_module_key(file_name): + module_key = None + if file_name[-3:] == ".cl": + module_key = file_name[:-3] + elif file_name[-2:] == ".h": + module_key = file_name + + return module_key + + def encrypt_opencl_codegen(cl_kernel_dir, output_path): if not os.path.exists(cl_kernel_dir): print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!") - header_code = "" - for file_name in os.listdir(cl_kernel_dir): - file_path = os.path.join(cl_kernel_dir, file_name) - if file_path[-2:] == ".h": - with open(file_path, "r") as f: - header_code += f.read() - encrypted_code_maps = {} for file_name in os.listdir(cl_kernel_dir): file_path = os.path.join(cl_kernel_dir, file_name) - if file_path[-3:] == ".cl": + module_key = get_module_key(file_name) + if len(module_key) > 0: with open(file_path, "r") as f: code_str = "" + headers = [] for line in f.readlines(): if "#include " in line: - code_str += header_code + headers.append(get_module_key("common.h")) else: code_str += line encrypted_code_arr = encrypt_code(code_str) - encrypted_code_maps[file_name[:-3]] = encrypted_code_arr - - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - cpp_cl_encrypted_kernel = env.get_template( - 'str2vec_maps.cc.jinja2').render( - maps=encrypted_code_maps, - data_type='unsigned char', - variable_name='kEncryptedProgramMap') - - output_dir = os.path.dirname(output_path) - if os.path.exists(output_dir): - if os.path.isdir(output_dir): - try: - shutil.rmtree(output_dir) - except OSError: - raise RuntimeError( - "Cannot delete directory %s due to permission " - "error, inspect and remove manually" % output_dir) - else: - raise RuntimeError( - "Cannot delete non-directory %s, inspect ", - "and remove manually" % output_dir) - os.makedirs(output_dir) - - with open(output_path, "w") as w_file: - w_file.write(cpp_cl_encrypted_kernel) + encrypted_code = {} + encrypted_code['headers'] = headers + encrypted_code['code'] = encrypted_code_arr + encrypted_code_maps[module_key] = encrypted_code + + create_output_dir(os.path.dirname(output_path)) + write_cl_encrypted_kernel_to_file( + encrypted_code_maps, 'str2vec_maps.cc.jinja2', output_path) + output_path_h = output_path.replace('.cc', '.h') + write_cl_encrypted_kernel_to_file( + encrypted_code_maps, 'str2vec_maps.h.jinja2', 
diff --git a/mace/python/tools/str2vec_maps.cc.jinja2 b/mace/python/tools/str2vec_maps.cc.jinja2
index 513114941e8267528ed33eddd5b7f7ebb64a57ab..d88347172d0dd4f50d382a7c7598723db151f2e1 100644
--- a/mace/python/tools/str2vec_maps.cc.jinja2
+++ b/mace/python/tools/str2vec_maps.cc.jinja2
@@ -14,24 +14,32 @@
 
 // This is a generated file. DO NOT EDIT!
 
+#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
+
 #include <map>
 #include <string>
-#include <vector>
 
 namespace mace {
+namespace {{tag}} {
 
-extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
-{
-  {% for key, value in maps.items() %}
+const std::map<std::string, ClProgramInfo> {{variable_name}} = {
+{% for key, encrypted_code in maps.items() %}
   {
-    "{{key}}",
-    {
-      {%- for ele in value -%}
-      {{ele}},
-      {%- endfor -%}
+    "{{key}}", {
+      {
+        {%- for header in encrypted_code['headers'] -%}
+        "{{header}}",
+        {%- endfor -%}
+      },
+      {
+        {%- for ele in encrypted_code['code'] -%}
+        {{ele}},
+        {%- endfor -%}
+      }
     }
   },  // {{key}}
 {% endfor %}
 };
 
+}  // {{tag}}
 }  // namespace mace
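Because the generated table no longer carries pre-expanded header text, whatever builds an OpenCL program is expected to resolve the recorded header modules through the same map before compiling. A minimal consumer-side sketch of that idea follows; it is not MACE's actual runtime code, and Decrypt() is a hypothetical stand-in for the inverse of the Python encrypt_code() above:

#include <string>
#include <vector>

#include "mace/codegen/opencl/encrypt_opencl_kernel.h"

// Hypothetical helper: inverse of encrypt_code() in the codegen script.
std::string Decrypt(const std::vector<unsigned char> &buf);

// Sketch only: prepend the decrypted sources of a program's header
// dependencies before handing the program to the OpenCL compiler.
std::string BuildProgramSource(const std::string &module_key) {
  const auto &program = mace::codegen::kEncryptedProgramMap.at(module_key);
  std::string source;
  for (const std::string &header : program.headers_) {
    const auto &dep = mace::codegen::kEncryptedProgramMap.at(header);
    source += Decrypt(dep.encrypted_code_);
  }
  source += Decrypt(program.encrypted_code_);
  return source;
}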
diff --git a/mace/ops/resize_bilinear.h b/mace/python/tools/str2vec_maps.h.jinja2
similarity index 54%
rename from mace/ops/resize_bilinear.h
rename to mace/python/tools/str2vec_maps.h.jinja2
index b5f50d29336b9af9cb4b756a15999074a566ed5b..9e89e416ebe1b67545538346431245c735104dc9 100644
--- a/mace/ops/resize_bilinear.h
+++ b/mace/python/tools/str2vec_maps.h.jinja2
@@ -12,23 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_RESIZE_BILINEAR_H_
-#define MACE_OPS_RESIZE_BILINEAR_H_
+// This is a generated file. DO NOT EDIT!
 
-#include "mace/core/types.h"
+#include <map>
+#include <string>
+#include <vector>
 
 namespace mace {
-namespace ops {
-namespace resize_bilinear {
 
-inline float CalculateResizeScale(index_t in_size,
-                                  index_t out_size,
-                                  bool align_corners) {
-  return (align_corners && out_size > 1)
-             ? (in_size - 1) / static_cast<float>(out_size - 1)
-             : in_size / static_cast<float>(out_size);
-}
-}  // namespace resize_bilinear
-}  // namespace ops
-}  // namespace mace
+namespace {{tag}} {
+
+struct ClProgramInfo {
+  const std::vector<std::string> headers_;
+  const std::vector<{{data_type}}> encrypted_code_;
+};
 
-#endif  // MACE_OPS_RESIZE_BILINEAR_H_
+extern const std::map<std::string, ClProgramInfo> {{variable_name}};
+
+}  // {{tag}}
+}  // namespace mace
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index 63191cda20032c191992ea3624c13c121c585121..545af54d3dabab5ef7c9e34ccc2fbd9186c9f7c1 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
 
     ret = repository_ctx.execute(
-        ["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path],
+        ["test", "-f", "%s/mace/ops/opencl/cl/common.cl" % mace_root_path],
     )
     if ret.return_code == 0:
         unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
         python_bin_path,
         "%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
         "--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
-        "--output_path=%s/encrypt_opencl_kernel" % generated_files_path,
+        "--output_path=%s/encrypt_opencl_kernel.cc" % generated_files_path,
     ], quiet = False)
 
 encrypt_opencl_kernel_repository = repository_rule(
diff --git a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
index 6e5f7017e822dadb8d8c1044dc8875631fa6a28d..07685255c407a59e57f2edd2d01570bddf2e54bd 100644
--- a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
+++ b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters,
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
   auto transform_func = [&]() {
-    OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+    OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
         .Transform(&context,
                    net.ws()->GetTensor("Input"),
                    OpenCLBufferType::IN_OUT_CHANNEL,
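This hunk, and the test hunks that follow, drop the element-type template argument when constructing OpenCLBufferTransformer, in line with GPU kernels now being registered for a single data type. The transformer's own declaration is not part of this patch; the presumed shape of the change, as a sketch only:

// Sketch of the presumed interface change (the real declaration lives in the
// OpenCL op sources, which this patch does not touch):
//
//   before:  template <typename T>
//            class OpenCLBufferTransformer { ... };  // one type per element type
//
//   after:   class OpenCLBufferTransformer { ... };  // element type resolved at
//                                                    // runtime from the tensors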
#include "mace/benchmark_utils/test_benchmark.h" +#include "mace/ops/common/pad_type.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/pad.h" namespace mace { namespace ops { diff --git a/test/ccbenchmark/mace/ops/pooling_benchmark.cc b/test/ccbenchmark/mace/ops/pooling_benchmark.cc index 6b66a9fa7032ca29fc16fe888c9f532997ee37de..314cc6f90a98d9e732510869c0488bf50b3d478f 100644 --- a/test/ccbenchmark/mace/ops/pooling_benchmark.cc +++ b/test/ccbenchmark/mace/ops/pooling_benchmark.cc @@ -14,7 +14,7 @@ #include "mace/benchmark_utils/test_benchmark.h" #include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/pooling.h" +#include "mace/ops/common/pooling_type.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/test/ccunit/mace/ops/buffer_to_image_test.cc b/test/ccunit/mace/ops/buffer_to_image_test.cc index cb52eafe19bf27f926c36653889942a232edb2c5..644283d405f2a712c58707b83e3070893e2d2ba2 100644 --- a/test/ccunit/mace/ops/buffer_to_image_test.cc +++ b/test/ccunit/mace/ops/buffer_to_image_test.cc @@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, Tensor *b2i_output = net.ws()->CreateTensor( "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); @@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, Tensor *b2i_output = net.ws()->CreateTensor( "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DT_FLOAT); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); @@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); // Transform - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); diff --git a/test/ccunit/mace/ops/buffer_transform_test.cc b/test/ccunit/mace/ops/buffer_transform_test.cc index 
diff --git a/test/ccunit/mace/ops/buffer_to_image_test.cc b/test/ccunit/mace/ops/buffer_to_image_test.cc
index cb52eafe19bf27f926c36653889942a232edb2c5..644283d405f2a712c58707b83e3070893e2d2ba2 100644
--- a/test/ccunit/mace/ops/buffer_to_image_test.cc
+++ b/test/ccunit/mace/ops/buffer_to_image_test.cc
@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
   Tensor *b2i_output = net.ws()->CreateTensor(
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  type, MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
   Tensor *b2i_output = net.ws()->CreateTensor(
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"), type,
                  MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DT_FLOAT);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
   // Transform
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"), type,
                  MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
diff --git a/test/ccunit/mace/ops/buffer_transform_test.cc b/test/ccunit/mace/ops/buffer_transform_test.cc
index a9af4bc9943fceb62d61e9ec7b13a58188230e83..f29a2e012249d5214ddedeaf9320aec80e71120c 100644
--- a/test/ccunit/mace/ops/buffer_transform_test.cc
+++ b/test/ccunit/mace/ops/buffer_transform_test.cc
@@ -45,8 +45,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
       "BtOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  type, MemoryType::GPU_BUFFER, 0, bt_output);
 
@@ -54,8 +54,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
   Tensor *output = net.ws()->CreateTensor(
       "Output", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, bt_output, type, MemoryType::GPU_BUFFER,
                  0, output);
 
@@ -90,8 +90,8 @@ void TestArgumentTransform(const index_t input_size) {
   Tensor *output = net.ws()->CreateTensor(
       "Output", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
                  0, output);
diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
index 8909f35113c5a77d78cf614970d9d027019f111c..8a17c2d2c5e5d9ed0431005404b630efdfd2c974 100644
--- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
+++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
                          DtToCLCMDDt(DataTypeToEnum::value));
   } else {
     built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum::value));
+                          DtToCLDt(DataTypeToEnum::value));
     built_options.emplace(
         "-DCMD_DATA_TYPE=" +
-        DtToUpCompatibleCLCMDDt(DataTypeToEnum::value));
+        DtToCLCMDDt(DataTypeToEnum::value));
   }
 
   cl::Kernel kernel;
diff --git a/test/ccunit/mace/ops/pad_test.cc b/test/ccunit/mace/ops/pad_test.cc
index 977305597ae742866d2c1d63c48f571cfaa884e7..3d785ac7603b75d9a2e11ca65faeefb1cc40abbc 100644
--- a/test/ccunit/mace/ops/pad_test.cc
+++ b/test/ccunit/mace/ops/pad_test.cc
@@ -16,8 +16,8 @@
 #include
 #include
 
+#include "mace/ops/common/pad_type.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/ops/pad.h"
 
 namespace mace {
 namespace ops {
diff --git a/test/ccunit/mace/ops/pooling_test.cc b/test/ccunit/mace/ops/pooling_test.cc
index 037cf8cf76e1926f941a92ea5eb1197b11e74b99..caa525c67b592dc44084f63093b3a20ad3aeb4c7 100644
--- a/test/ccunit/mace/ops/pooling_test.cc
+++ b/test/ccunit/mace/ops/pooling_test.cc
@@ -14,8 +14,8 @@
 
 #include
 
-#include "mace/ops/pooling.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/common/pooling_type.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/test/ccunit/mace/ops/reduce_test.cc b/test/ccunit/mace/ops/reduce_test.cc
index 21a2dc13c3d63c8da97b47690b576d3d2499c6bf..753bf419debf706329b7c53898d2a561d0ff61ac 100644
--- a/test/ccunit/mace/ops/reduce_test.cc
+++ b/test/ccunit/mace/ops/reduce_test.cc
@@ -14,7 +14,7 @@
 
 #include
 
-#include "mace/ops/reduce.h"
+#include "mace/ops/common/reduce_type.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {