提交 85cef1d8 编写于 作者: L luxuhui

adjust opencl code to minify the libmace.so's size

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 23d985f7
...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA) ...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA) enable_language(CUDA)
endif(MACE_ENABLE_CUDA) endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)) if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a") if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so # Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold") set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation { ...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MyCustomOp<DeviceType::GPU, T> : public Operation { class MyCustomOp<DeviceType::GPU, float> : public Operation {
... ...
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) { ...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -5,7 +5,7 @@ package( ...@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"], default_visibility = ["//visibility:public"],
) )
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule") load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library( cc_library(
name = "generated_models", name = "generated_models",
...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule() ...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library( cc_library(
name = "generated_opencl", name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"], srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
......
...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type) std::string key = OpKeyBuilder(op_type)
.Device(device_type) .Device(device_type)
.TypeConstraint("T", dtype) .TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build(); .Build();
if (registry_.at(op_type)->creators.count(key) == 0) { if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key; LOG(FATAL) << "Key not registered: " << key;
......
...@@ -39,7 +39,7 @@ class OpConditionContext { ...@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info); OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default; ~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def); void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const { inline const OperatorDef *operator_def() const {
return operator_def_; return operator_def_;
...@@ -49,7 +49,7 @@ class OpConditionContext { ...@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -110,7 +110,7 @@ class OpConstructContext { ...@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -166,14 +166,14 @@ class Operation { ...@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context); explicit Operation(OpConstructContext *context);
virtual ~Operation() = default; virtual ~Operation() = default;
template <typename T> template<typename T>
inline T GetOptionalArg(const std::string &name, inline T GetOptionalArg(const std::string &name,
const T &default_value) const { const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>( return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value); *operator_def_, name, default_value);
} }
template <typename T> template<typename T>
inline std::vector<T> GetRepeatedArgs( inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const { const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
...@@ -240,7 +240,6 @@ class Operation { ...@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ #define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ } enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo { struct OpRegistrationInfo {
public: public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)> typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
...@@ -290,7 +289,6 @@ class OpConditionBuilder { ...@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_; OpRegistrationInfo::DataFormatSelector data_format_selector_;
}; };
class OpRegistryBase { class OpRegistryBase {
public: public:
OpRegistryBase() = default; OpRegistryBase() = default;
...@@ -315,7 +313,7 @@ class OpRegistryBase { ...@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type) const; DeviceType device_type) const;
template <class DerivedType> template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator( static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) { OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context)); return std::unique_ptr<Operation>(new DerivedType(context));
...@@ -334,6 +332,24 @@ class OpRegistryBase { ...@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \ DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>) OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \ #define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder) op_registry->Register(builder)
......
...@@ -18,20 +18,19 @@ ...@@ -18,20 +18,19 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <mutex> // NOLINT(build/c++11) #include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/utils/macros.h" #include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h" #include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) { const std::string OpenCLErrorToString(cl_int error) {
switch (error) { switch (error) {
case CL_SUCCESS: case CL_SUCCESS:
...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint, const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint, const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage, std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner): std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage), cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage), precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner), tuner_(tuner),
...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) { if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = { std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(), CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback, CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0 CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
}; };
context_ = std::shared_ptr<cl::Context>( context_ = std::shared_ptr<cl::Context>(
...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( ...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true; return true;
} }
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Find program " << program_name << " failed.";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header(" << header << ") is empty.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource( bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name, const std::string &program_name,
const std::string &built_program_key, const std::string &built_program_key,
const std::string &build_options_str, const std::string &build_options_str,
cl::Program *program) { cl::Program *program) {
// Find from source std::string kernel_source;
auto it_source = kEncryptedProgramMap.find(program_name); MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (it_source != kEncryptedProgramMap.end()) { if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources; cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source); sources.push_back(kernel_source);
*program = cl::Program(context(), sources); *program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str()); cl_int ret = program->build({device()}, build_options_str.c_str());
......
...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type", *net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE)); static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i); const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type); runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
......
...@@ -118,9 +118,21 @@ def mace_version_genrule(): ...@@ -118,9 +118,21 @@ def mace_version_genrule():
) )
def encrypt_opencl_kernel_genrule(): def encrypt_opencl_kernel_genrule():
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule( native.genrule(
name = "encrypt_opencl_kernel_gen", name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))], srcs = srcs,
outs = ["opencl/encrypt_opencl_kernel.cc"], outs = outs,
cmd = "cat $(SRCS) > $@;" cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
) )
...@@ -181,7 +181,6 @@ cc_library( ...@@ -181,7 +181,6 @@ cc_library(
], ],
) )
cc_library( cc_library(
name = "internal_ops", name = "internal_ops",
srcs = glob( srcs = glob(
......
...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation { ...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ActivationOp<DeviceType::GPU, T> : public Operation { class ActivationOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ActivationOp(OpConstructContext *context) explicit ActivationOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
ActivationType type = ops::StringToActivationType( ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
"NOOP")); "NOOP"));
auto relux_max_limit = static_cast<T>( auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
Operation::GetOptionalArg<float>("max_limit", 0.0f)); auto leakyrelu_coefficient =
auto leakyrelu_coefficient = static_cast<T>( Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>( kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient); type, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (type == ActivationType::PRELU) { if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation { ...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) { void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Activation") OpConditionBuilder("Activation")
...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) { ...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class AddNOp; class AddNOp;
template <> template<>
class AddNOp<DeviceType::CPU, float> : public Operation { class AddNOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation { ...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class AddNOp<DeviceType::GPU, T> : public Operation { class AddNOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>(); kernel_ = make_unique<opencl::image::AddNKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation { ...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) { void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("AddN") OpConditionBuilder("AddN")
...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) { ...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchNormOp<DeviceType::GPU, T> : public Operation { class BatchNormOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BatchNormOp(OpConstructContext *context) explicit BatchNormOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>( kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient); epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor( const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i)); operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr); MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) { void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation { ...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
} }
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class BatchToSpaceNDOp; class BatchToSpaceNDOp;
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
} }
}; };
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { ...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) { : BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>(); kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { ...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float); BatchToSpaceNDOp, DeviceType::CPU, float);
...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { ...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t); BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -34,16 +34,16 @@ ...@@ -34,16 +34,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BiasAddOp; class BiasAddOp;
template <> template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation { class BiasAddOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)) has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
{} 0)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BiasAddOp<DeviceType::GPU, T> : public Operation { class BiasAddOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>(); kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) { void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("BiasAdd") OpConditionBuilder("BiasAdd")
...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { ...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -23,10 +23,10 @@ ...@@ -23,10 +23,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class ChannelShuffleOp; class ChannelShuffleOp;
template <typename T> template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { ...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_; const int groups_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1); const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups); kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { ...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) { void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle", MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float); ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { ...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1); *op, "group", 1);
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
index_t channels = op->output_shape(0).dims(3); index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups; index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) { if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_PAD_H_ #ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_PAD_H_ #define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -27,4 +27,4 @@ enum PadType { ...@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_PAD_H_ #endif // MACE_OPS_COMMON_PAD_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_POOLING_H_ #ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_POOLING_H_ #define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace { namespace mace {
...@@ -23,4 +23,4 @@ enum PoolingType { ...@@ -23,4 +23,4 @@ enum PoolingType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_POOLING_H_ #endif // MACE_OPS_COMMON_POOLING_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_REDUCE_H_ #ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_REDUCE_H_ #define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace { namespace mace {
...@@ -28,4 +28,4 @@ enum ReduceType { ...@@ -28,4 +28,4 @@ enum ReduceType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_REDUCE_H_ #endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
...@@ -12,14 +12,16 @@ ...@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_ #ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_RESIZE_BICUBIC_H_ #define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h" #include "mace/core/types.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace resize_bicubic { namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10); constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size, inline float CalculateResizeScale(index_t in_size,
...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size, ...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1) ? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size); : in_size / static_cast<float>(out_size);
} }
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_ #endif // MACE_OPS_COMMON_UTILS_H_
...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation { ...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_; int axis_;
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class ConcatOp; class ConcatOp;
template <typename T> template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase { ...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) { : ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>(); kernel_ = make_unique<opencl::image::ConcatKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { ...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) { void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} else { } else {
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3); *op, "axis", 3);
if (!has_data_format || axis != 3) { if (!has_data_format || axis != 3) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
bool divisible_four = true; bool divisible_four = true;
for (const std::string &input : op->input()) { for (const std::string &input : op->input()) {
...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) {
} }
// Only support not divisible 4 case with 2 inputs. // Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) { if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase { ...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public: public:
explicit Conv2dOp(OpConstructContext *context) explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
} }
// Transform filter tensor to target format // Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) && if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(), strides_.data(),
dilations_.data(), dilations_.data(),
&wino_block_size_))) { &wino_block_size_))) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} else { } else {
wino_block_size_ = 0; wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) { ...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -24,10 +24,10 @@ ...@@ -24,10 +24,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class CropOp; class CropOp;
template <class T> template<class T>
class CropOp<DeviceType::CPU, T> : public Operation { class CropOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>(); const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(), crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data()); output_shape, offsets.data());
...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
private: private:
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const int32_t* offsets) { const int32_t *offsets) {
const index_t out_img_size = const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3]; output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3]; const index_t out_hw = output_shape[2] * output_shape[3];
...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) { for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) { for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) { for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr = T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3]; output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch = const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size + input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw + (c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3]; (h + offsets[2]) * input_shape[3] + offsets[3];
...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class CropOp<DeviceType::GPU, T> : public Operation { class CropOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>( kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset")); Operation::GetRepeatedArgs<int>("offset"));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) { void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Crop") OpConditionBuilder("Crop")
...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) { ...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) { if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
} else { } else {
if (operator_def_->input_size() >= 4) { if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
3, 3,
...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) { void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Deconv2D") OpConditionBuilder("Deconv2D")
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class DepthToSpaceOp : public Operation { class DepthToSpaceOp : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation { ...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation { class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1); int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size); kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) { ...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace", MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float); DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t> ...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) { : DepthwiseConv2dOpBase(context) {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
} }
Tensor *filter_tensor = context->workspace()->GetTensor( Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) { if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { ...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t); DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", #ifdef MACE_ENABLE_OPENCL
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("DepthwiseConv2d") OpConditionBuilder("DepthwiseConv2d")
......
...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context) explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type) OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { ...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float); DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class EltwiseOp<DeviceType::GPU, T> : public Operation { class EltwiseOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit EltwiseOp(OpConstructContext *context) explicit EltwiseOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>( kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index); type, coeff, scalar_input, scalar_input_index);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) && if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) { ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
OpenCLBufferType::ARGUMENT, OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { } else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) { ...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t> ...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase { class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public: public:
explicit FullyConnectedOp(OpConstructContext *context) explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) { : FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>(); kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
OpenCLBufferType::WEIGHT_WIDTH, OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) { ...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t); FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation { class IdentityOp : public Operation {
public: public:
explicit IdentityOp(OpConstructContext *context) explicit IdentityOp(OpConstructContext *context)
...@@ -34,15 +33,13 @@ class IdentityOp : public Operation { ...@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
}; };
void RegisterIdentity(OpRegistryBase *op_registry) { void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float); DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation { class InferConv2dShapeOp : public Operation {
public: public:
explicit InferConv2dShapeOp(OpConstructContext *context) explicit InferConv2dShapeOp(OpConstructContext *context)
...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation { ...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation {
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else { } else {
switch (padding_type) { switch (padding_type) {
case SAME: case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0]; out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1]; out_w = (in_w + strides[1] - 1) / strides[1];
break; break;
case VALID: }
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0]; out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1]; out_w = (in_w - kernels[3] + 1) / strides[1];
break; break;
default: }
default: {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
break; break;
} }
} }
}
if (isNCHW) { if (isNCHW) {
output_data[0] = out_batch; output_data[0] = out_batch;
...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation { ...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
}; };
void RegisterInferConv2dShape(OpRegistryBase *op_registry) { void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float); InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t); InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float); InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase { ...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase { class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public: public:
explicit MatMulOp(OpConstructContext *context) explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) { : MatMulOpBase(context) {
...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase { ...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
}; };
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) { void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) { ...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter( ...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t out_chan = input->dim(0); const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter( ...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG; MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter( ...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t multiplier = input->dim(0); const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter( ...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -164,7 +164,6 @@ MaceStatus TransformArgument( ...@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t size = input->dim(0); const index_t size = input->dim(0);
...@@ -181,8 +180,9 @@ MaceStatus TransformArgument( ...@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name); built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -229,6 +229,30 @@ MaceStatus TransformArgument( ...@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
// Dispatches a buffer transform by OpenCL buffer type.
// Filter/argument tensors get a layout transform; any other type is either
// dtype-converted or, when dtypes already match, aliased onto the input
// buffer without a copy.
MaceStatus BufferTransform::Compute(OpContext *context,
                                    const Tensor *input,
                                    const OpenCLBufferType type,
                                    const int wino_blk_size,
                                    Tensor *output) {
  MACE_UNUSED(wino_blk_size);  // winograd block size is irrelevant in buffer mode
  if (type == CONV2D_FILTER) {
    return TransformConv2DFilter(context, &kernel_, input, output);
  }
  if (type == DW_CONV2D_FILTER) {
    return TransformDWConv2DFilter(context, &kernel_, input, output);
  }
  if (type == ARGUMENT) {
    return TransformArgument(context, &kernel_, input, output);
  }
  // Remaining buffer types: only a data-type conversion may be needed.
  if (input->dtype() != output->dtype()) {
    return BufferTypeTransform(context, &kernel_, input, output);
  }
  // Same dtype: no GPU work, just share the underlying buffer.
  SetFutureDefaultWaitFn(context->future());
  output->ReuseTensorBuffer(*input);
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform( ...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformConv2DFilter( MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformDWConv2DFilter( MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformArgument( MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
class BufferTransform : public OpenCLBufferTransformKernel {
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel { ...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform( ...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform( ...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name); built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Reports whether this convolution shape is eligible for the Winograd path.
// In the buffer implementation only 3x3 / stride-1 / dilation-1 filters
// qualify; the runtime, output shape and block-size out-param are unused
// here but kept so the signature matches the image-based kernel.
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<index_t> &filter_shape,
    const std::vector<index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_block_size) {
  MACE_UNUSED(kwg_size_);  // member kept for interface parity, silence warning
  MACE_UNUSED(runtime);
  MACE_UNUSED(output_shape);
  MACE_UNUSED(wino_block_size);
  const bool filter_is_3x3 = filter_shape[2] == 3 && filter_shape[3] == 3;
  const bool unit_strides = strides[0] == 1 && strides[1] == 1;
  const bool unit_dilations = dilations[0] == 1 && dilations[1] == 1;
  return filter_is_3x3 && unit_strides && unit_dilations;
}
// Runs a 2-D convolution on GPU buffers: resolves paddings/output shape,
// pads the input into scratch memory when needed, then dispatches to the
// 1x1 or the general conv kernel. The pad and conv kernels run async;
// their futures are merged into the context's future at the end.
// winograd_blk_size is accepted for interface parity but unused here.
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int winograd_blk_size,
    Tensor *output) {
  MACE_UNUSED(winograd_blk_size);
  StatsFuture pad_future, conv_future;
  index_t filter_h = filter->dim(2);
  index_t filter_w = filter->dim(3);
  // Reshape output
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // No explicit paddings given: derive both paddings and output shape
    // from the padding policy (SAME/VALID).
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    // Explicit paddings: only the output shape needs computing.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  // paddings are totals per dimension; split roughly evenly (top/left half).
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;
  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);
  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  bool use_1x1 = filter_h == 1 && filter_w == 1;
  // Round width/channels up to the kernel's tile sizes so the GPU kernels
  // can process whole tiles without bounds checks.
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w, tile_c = 4;
  if (use_1x1) {
    tile_w = 2;
  } else {
    tile_w = 4;
  }
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    // decide scratch size before allocate it
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A scratch reallocation invalidates previously bound kernel args,
    // so treat it as an input change to force re-binding.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  // Both lambdas capture locals by reference; they are invoked below,
  // before anything captured goes out of scope.
  if (use_1x1) {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2d1x1(
          context, &kernels_[1], pad_input, filter, bias, strides,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  } else {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2dGeneral(
          context, &kernels_[1], pad_input, filter, bias, strides, dilations,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  }
  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
  // Caller waits on a single future covering both the pad and conv kernels.
  MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context, ...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace conv2d } // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel { class Conv2dKernel : public OpenCLConv2dKernel {
public: public:
Conv2dKernel() : old_scratch_size_(0) {} Conv2dKernel() : old_scratch_size_(0) {}
...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { ...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} }
} // namespace depthwise } // namespace depthwise
// Runs a buffer-based depthwise conv2d on the GPU: computes output/padding
// geometry, pads the input into a scratch tensor when the padded geometry
// differs from the raw input, then launches the depthwise_conv2d kernel.
// OpenCL data types are taken from the tensors at runtime (the class is no
// longer templated on a compile-time data type).
MaceStatus DepthwiseConv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  StatsFuture pad_future, dw_conv_future;
  index_t filter_w = filter->dim(3);

  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
  fake_filter_shape[1] = filter->dim(1);
  fake_filter_shape[2] = filter->dim(2);
  fake_filter_shape[3] = filter->dim(3);

  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;

  MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
  MACE_CHECK(filter->dim(0) * input_channels == channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);

  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  // Round the output width and the channels up to 4-element tiles so the
  // kernel can process fixed-size tiles.
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w = 4, tile_c = 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);

  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);

  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;

    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A resized scratch buffer means the cached kernel args refer to stale
    // memory, so force them to be set again. (The original
    // `input_changed |= scratch->size() != old_scratch_size_;` was
    // tautologically true inside this guard.)
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());

    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  MACE_RETURN_IF_ERROR(
      depthwise::DepthwiseConv2d(
          context, &kernels_[1], padded_input_ptr, filter, bias, strides,
          dilations, activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &dw_conv_future));
  MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace depthwise } // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public: public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {} DepthwiseConv2dKernel() : old_scratch_size_(0) {}
...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { ...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private: private:
index_t old_scratch_size_; index_t old_scratch_size_;
cl::Kernel kernels_[2]; cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the buffer depthwise conv2d: the
// kernel's output data type comes from the template parameter T
// (DataTypeToEnum<T>::v()) rather than from the tensors at runtime.
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  StatsFuture pad_future, dw_conv_future;
  index_t filter_w = filter->dim(3);

  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
  fake_filter_shape[1] = filter->dim(1);
  fake_filter_shape[2] = filter->dim(2);
  fake_filter_shape[3] = filter->dim(3);

  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;

  MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
  MACE_CHECK(filter->dim(0) * input_channels == channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);

  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  // Round output width / channels up to 4-element tiles for the kernel.
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w = 4, tile_c = 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);

  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);

  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;

    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // Scratch growth invalidates cached kernel args; the |= expression is
    // always true inside this guard.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());

    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  MACE_RETURN_IF_ERROR(
      depthwise::DepthwiseConv2d(
          context, &kernels_[1], padded_input_ptr, filter, bias, strides,
          dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &dw_conv_future));
  MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Buffer-based GPU pooling (MAX/AVG). Pads the channel dimension up to a
// multiple of 4 into a scratch tensor when needed, then runs the pooling
// kernel. In/out OpenCL data types come from the tensors at runtime; the
// compute type is the input type for a same-dtype MAX pool, float otherwise.
MaceStatus PoolingKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "Pooling opencl kernel not support dilation yet";
  StatsFuture pad_future, pooling_future;
  index_t input_channels = input->dim(3);

  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};

  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();

  // pad input channels up to a multiple of 4
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);

  const Tensor *padded_input_ptr = input;
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;

    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A resized scratch buffer means the cached kernel args refer to stale
    // memory, so force them to be set again. (The original
    // `input_changed |= scratch->size() != old_scratch_size_;` was
    // tautologically true inside this guard.)
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, 0, 0,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION
  // Build the pooling kernel once and cache it.
  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    auto input_dtype = input->dtype();
    auto input_dt = DtToCLDt(input_dtype);
    built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
    auto output_dtype = output->dtype();
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
    // Same-dtype MAX pooling computes in the input type; other combinations
    // compute in float.
    if (pooling_type == MAX && input_dtype == output_dtype) {
      built_options.emplace("-DDATA_TYPE=" + input_dt);
    } else {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
                                              kernel_name,
                                              built_options,
                                              kernel));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }

  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(*kernel);
  // Re-set kernel arguments only when the input (or scratch) changed.
  if (input_changed) {
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_3D_GWS_ARGS(*kernel, gws);
    kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
    kernel->setArg(idx++, paddings[0] / 2);
    kernel->setArg(idx++, paddings[1] / 2);
    kernel->setArg(idx++, strides[0]);
    kernel->setArg(idx++, strides[1]);
    kernel->setArg(idx++, kernels[0]);
    kernel->setArg(idx++, kernels[1]);
    kernel->setArg(idx++, *(output->opencl_buffer()));
  }

  const std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION
  MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel { class PoolingKernel : public OpenCLPoolingKernel {
public: public:
PoolingKernel() : old_scratch_size_(0) {} PoolingKernel() : old_scratch_size_(0) {}
...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel { ...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the buffer pooling kernel: the output
// and compute OpenCL data types come from the template parameter T
// (DataTypeToEnum<T>::value) rather than the runtime tensor dtypes.
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "Pooling opencl kernel not support dilation yet";
  StatsFuture pad_future, pooling_future;
  index_t input_channels = input->dim(3);

  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};

  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();

  // pad input channels up to a multiple of 4
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);

  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;

    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // Scratch growth invalidates cached kernel args; the |= expression is
    // always true inside this guard.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, 0, 0,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION
  // Build the pooling kernel once and cache it.
  if (kernel->get() == nullptr) {
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    // Same-dtype MAX pooling computes in T; otherwise widen via
    // DtToUpCompatibleCLDt.
    if (pooling_type == MAX && input->dtype() == output->dtype()) {
      built_options.emplace("-DIN_DATA_TYPE=" +
                            DtToCLDt(input->dtype()));
      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    } else {
      built_options.emplace("-DIN_DATA_TYPE=" +
                            DtToCLDt(input->dtype()));
      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
                                              kernel_name,
                                              built_options,
                                              kernel));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }

  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(*kernel);
  // Re-set kernel arguments only when the input (or scratch) changed.
  if (input_changed) {
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_3D_GWS_ARGS(*kernel, gws);
    kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
    kernel->setArg(idx++, paddings[0] / 2);
    kernel->setArg(idx++, paddings[1] / 2);
    kernel->setArg(idx++, strides[0]);
    kernel->setArg(idx++, strides[1]);
    kernel->setArg(idx++, kernels[0]);
    kernel->setArg(idx++, kernels[1]);
    kernel->setArg(idx++, *(output->opencl_buffer()));
  }

  const std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION
  MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Buffer-based GPU softmax. Supports rank-2 (N, C) and rank-4 (N, H, W, C)
// logits; other ranks hit MACE_NOT_IMPLEMENTED. In/out OpenCL data types
// come from the tensors at runtime; the compute type is float.
MaceStatus SoftmaxKernel::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  // Channels are handled in groups of 4; remain_channels is the tail the
  // kernel must mask off.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int remain_channels = channel_blocks * 4 - channels;

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION
  // Build the softmax kernel once and cache it.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    if (use_log_) built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-set kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_buffer()));
    kernel_.setArg(idx++, static_cast<int>(height));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -29,7 +29,7 @@ namespace mace { ...@@ -29,7 +29,7 @@ namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log) explicit SoftmaxKernel(bool use_log)
...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { ...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the buffer softmax kernel: the output
// and compute OpenCL data types come from the template parameter T rather
// than the runtime tensor dtypes.
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  // Channels are handled in groups of 4; remain_channels is the masked tail.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int remain_channels = channel_blocks * 4 - channels;

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION
  // Build the softmax kernel once and cache it.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    if (use_log_) built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-set kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_buffer()));
    kernel_.setArg(idx++, static_cast<int>(height));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -20,11 +20,11 @@ ...@@ -20,11 +20,11 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BufferTransformOp; class BufferTransformOp;
template <typename T> template<>
class BufferTransformOp<DeviceType::GPU, T> : public Operation { class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BufferTransformOp(OpConstructContext *context) explicit BufferTransformOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor( MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type(); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform( return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output); context, input, type, out_mem_type_, wino_blk_size_, output);
} }
...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_; MemoryType out_mem_type_;
}; };
void RegisterBufferTransform(OpRegistryBase *op_registry) { void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform", MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
} }
} // namespace ops } // namespace ops
......
...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) { ...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix; return name + postfix;
} }
// Transforms the filter input of |op_def| at |input_idx| into the layout
// required by |mem_type| (via |buffer_type| / |wino_blk_size|), stores the
// result under a derived tensor name, and rewires the op to consume it.
// The original filter tensor is marked unused afterwards.
MaceStatus TransformFilter(
    mace::OpConstructContext *context,
    OperatorDef *op_def,
    const int input_idx,
    const OpenCLBufferType buffer_type,
    const MemoryType mem_type,
    const int wino_blk_size) {
  Workspace *workspace = context->workspace();
  OpContext op_context(workspace, context->device());

  const std::string src_name = op_def->input(input_idx);
  Tensor *src_tensor = workspace->GetTensor(src_name);
  const std::string dst_name = TransformedFilterName(src_name);
  // Destination keeps the source dtype; created as a persistent tensor.
  Tensor *dst_tensor = workspace->CreateTensor(
      dst_name, context->device()->allocator(), src_tensor->dtype(), true);

  // Rewire the op to read the transformed tensor instead of the original.
  op_def->set_input(input_idx, dst_name);
  src_tensor->MarkUnused();

  OpenCLBufferTransformer transformer(src_tensor->memory_type(), mem_type);
  return transformer.Transform(&op_context, src_tensor, buffer_type, mem_type,
                               wino_blk_size, dst_tensor);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -28,17 +28,16 @@ ...@@ -28,17 +28,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
// Only used for GPU Operation(BufferTransform) // Only used for GPU Operation(BufferTransform)
template<typename T>
class OpenCLBufferTransformer { class OpenCLBufferTransformer {
public: public:
OpenCLBufferTransformer(const MemoryType in_mem_type, OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) { const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) { if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>(); kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) { } else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>(); kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else { } else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>(); kernel_ = make_unique<opencl::buffer::BufferTransform>();
} }
} }
...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer { ...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size, const int wino_blk_size,
Tensor *output) { Tensor *output) {
Workspace *ws = context->workspace(); Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value; DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type(); MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE || if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) { out_mem_type == MemoryType::GPU_BUFFER) {
...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer { ...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name() << " to CPU Buffer " << output->name()
<< " with data type " << dt; << " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor); Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>(); const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape()); output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>(); float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else { } else {
LOG(FATAL) << "Unexpected error: " << out_mem_type; LOG(FATAL) << "Unexpected error: " << out_mem_type;
...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer { ...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name); std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter( MaceStatus TransformFilter(
mace::OpConstructContext *context, mace::OpConstructContext *context,
OperatorDef *op_def, OperatorDef *op_def,
const int input_idx, const int input_idx,
const OpenCLBufferType buffer_type, const OpenCLBufferType buffer_type,
const MemoryType mem_type, const MemoryType mem_type,
const int wino_blk_size = 0) { const int wino_blk_size = 0);
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
class OpContext; class OpContext;
......
...@@ -17,7 +17,10 @@ ...@@ -17,7 +17,10 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace { namespace mace {
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#include <vector> #include <vector>
#include "mace/ops/common/activation_type.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace { namespace mace {
......
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
......
...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) { ...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
} }
} }
std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws, const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
......
...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape( ...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type // CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type // CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt); std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select // CPU data type to OpenCL condition data type used in select
// e.g. half -> float // e.g. half -> float
std::string DtToCLCondDt(const DataType dt); std::string DtToCLCondDt(const DataType dt);
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the "activation" OpenCL image kernel on an image-backed tensor whose
// dims are read as (batch, height, width, channels).  The activation flavor
// (relu / relux / prelu / tanh / sigmoid / leakyrelu) is fixed at kernel
// compile time via a -DUSE_* build option chosen from activation_.
// The cl::Kernel is built lazily on the first call and cached in kernel_;
// kernel arguments are re-set only when the input shape changes.
// Returns MACE_SUCCESS, or propagates any build/tuning error.
MaceStatus ActivationKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *alpha,  // PReLU slope image; only read when activation_ == PRELU
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are processed in groups of 4 (one image pixel per group).
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build, keyed on the cached cl::Kernel being empty.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    // Always compute in float on the GPU (the half/float-templated variants
    // were removed in this revision to shrink the shared library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    // Pick the -DUSE_* define and the tuning-key prefix for this activation.
    switch (activation_) {
      case RELU: {
        tuning_key_prefix_ = "relu_opencl_kernel";
        built_options.emplace("-DUSE_RELU");
        break;
      }
      case RELUX: {
        tuning_key_prefix_ = "relux_opencl_kernel";
        built_options.emplace("-DUSE_RELUX");
        break;
      }
      case PRELU: {
        tuning_key_prefix_ = "prelu_opencl_kernel";
        built_options.emplace("-DUSE_PRELU");
        break;
      }
      case TANH: {
        tuning_key_prefix_ = "tanh_opencl_kernel";
        built_options.emplace("-DUSE_TANH");
        break;
      }
      case SIGMOID: {
        tuning_key_prefix_ = "sigmoid_opencl_kernel";
        built_options.emplace("-DUSE_SIGMOID");
        break;
      }
      case LEAKYRELU: {
        tuning_key_prefix_ = "leakyrelu_opencl_kernel";
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      }
      default: {
        LOG(FATAL) << "Unknown activation type: " << activation_;
      }
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size: one work-item per (channel-block, width, batch*height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel args only depend on shapes/images, so re-set them only when the
  // input shape differs from the last invocation.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      kernel_.setArg(idx++, *(alpha->opencl_image()));
    }
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // Tuning key includes the output shape so tuned local sizes are per-shape.
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,12 +31,11 @@ namespace ops { ...@@ -31,12 +31,11 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel { class ActivationKernel : public OpenCLActivationKernel {
public: public:
ActivationKernel(ActivationType type, ActivationKernel(ActivationType type,
T relux_max_limit, float relux_max_limit,
T leakyrelu_coefficient) float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit), : activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {} leakyrelu_coefficient_(leakyrelu_coefficient) {}
...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel { ...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private: private:
ActivationType activation_; ActivationType activation_;
T relux_max_limit_; float relux_max_limit_;
T leakyrelu_coefficient_; float leakyrelu_coefficient_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
}; };
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Element-wise sums 2..4 image-backed input tensors of identical shape into
// output_tensor using the "addn" OpenCL kernel.  The input count is baked
// into the kernel via -DINPUT_NUM, so the kernel is built lazily once and
// cached; more than 4 inputs is not implemented.
// Returns MACE_SUCCESS, or propagates resize/build/tuning errors.
MaceStatus AddNKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor) {
  size_t size = input_tensors.size();
  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
  const index_t batch = input_tensors[0]->dim(0);
  const index_t height = input_tensors[0]->dim(1);
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // All inputs must agree with input 0 on every dimension.
  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
    MACE_CHECK(height == input_tensors[i]->dim(1));
    MACE_CHECK(width == input_tensors[i]->dim(2));
    MACE_CHECK(channels == input_tensors[i]->dim(3));
  }
  // Lazy one-time kernel build; INPUT_NUM is a compile-time constant, so a
  // cached kernel implicitly assumes the same input count on later calls.
  if (kernel_.get() == nullptr) {
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
    built_options.emplace("-Daddn=" + kernel_name);
    // Always compute in float (templated half/float variants were removed
    // in this revision to shrink the shared library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  std::vector<index_t> output_shape = input_tensors[0]->shape();
  const index_t channel_blocks = RoundUpDiv4(channels);
  // 2D dispatch: x spans channel-blocks*width, y spans batch*height.
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Resize the output and (re-)bind kernel args only when the shape changed.
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(
        output_tensor->ResizeImage(output_shape, output_image_shape));
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
  }
  // Fixed local-size hint; the trailing 0 is part of the tuner's convention.
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel { class AddNKernel : public OpenCLAddNKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel { ...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Captures the batch-norm epsilon plus the fused-activation configuration
// (type, relux clamp, leaky-relu slope).  No GPU work happens here; the
// OpenCL kernel is built lazily inside Compute().
BatchNormKernel::BatchNormKernel(const float eps,
                                 const ActivationType act_type,
                                 const float max_limit,
                                 const float leaky_coeff)
    : epsilon_(eps),
      activation_(act_type),
      relux_max_limit_(max_limit),
      leakyrelu_coefficient_(leaky_coeff) {}
// Runs the "batch_norm" OpenCL image kernel with an optionally fused
// activation.  When mean and var are both provided ("not folded"), the
// kernel normalizes with epsilon_ at runtime; otherwise scale/offset are
// assumed pre-folded and the kernel is built with -DFOLDED_CONSTANT.
// The cl::Kernel is built lazily on first call and cached; kernel args are
// re-set only when the input shape changes.
// Returns MACE_SUCCESS, or propagates build/tuning errors.
MaceStatus BatchNormKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
    const Tensor *mean,   // may be nullptr when stats are folded into scale/offset
    const Tensor *var,    // may be nullptr when stats are folded into scale/offset
    Tensor *output) {
  bool not_folded = (mean != nullptr && var != nullptr);
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are processed in groups of 4 (one image pixel per group).
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build; folding state and activation are compile-time
  // options, so a cached kernel assumes they do not change across calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    // Always compute in float (templated half/float variants were removed
    // in this revision to shrink the shared library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (!not_folded) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
    // Fused activation selected at kernel compile time.  NOTE(review): unlike
    // ActivationKernel, there is no PRELU case here — presumably PReLU cannot
    // be fused into batch norm; NOOP means no activation.
    switch (activation_) {
      case NOOP:break;
      case RELU:built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel args only when the input shape differs from last call.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
    // mean/var/epsilon args exist only in the non-folded kernel variant.
    if (not_folded) {
      kernel_.setArg(idx++, *(mean->opencl_image()));
      kernel_.setArg(idx++, *(var->opencl_image()));
      kernel_.setArg(idx++, epsilon_);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // Tuning key includes the activation so each fused variant tunes separately.
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel { class BatchNormKernel : public OpenCLBatchNormKernel {
public: public:
BatchNormKernel( BatchNormKernel(
...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel { ...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Rearranges a batch-major tensor back into spatial layout (inverse of
// space-to-batch) using the "batch_to_space" OpenCL image kernel, cropping
// by paddings[0]/paddings[2] and expanding by block_shape.
// The cl::Kernel is built lazily on first call and cached; args are re-set
// only when the input (batch) tensor shape changes.
// Returns MACE_SUCCESS, or propagates resize/build/tuning errors.
MaceStatus BatchToSpaceKernel::Compute(
    OpContext *context,
    const Tensor *batch_tensor,
    const std::vector<int> &paddings,      // only indices 0 and 2 are read
    const std::vector<int> &block_shape,   // block sizes along H and W
    const std::vector<index_t> &output_shape,
    Tensor *space_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      space_tensor->ResizeImage(output_shape, output_image_shape));
  // Channels are processed in groups of 4 (one image pixel per group).
  const uint32_t chan_blk =
      static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
  // Dispatch over the INPUT (batch-major) tensor's extents.
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build.
  if (kernel_.get() == nullptr) {
    const char *kernel_name = "batch_to_space";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    // NOTE(review): this op derives the CL types from the input tensor's
    // dtype, while sibling kernels in this change hard-code DT_FLOAT —
    // confirm whether batch_to_space intentionally supports non-float data.
    auto dt = batch_tensor->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel args only when the input shape differs from last call.
  if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // Crop offsets: top (paddings[0]) and left (paddings[2]).
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = batch_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // Tuning key is keyed on the input (batch) tensor shape.
  std::string tuning_key =
      Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { ...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Adds a per-channel bias to a 4-D NHWC tensor on the GPU via an OpenCL
// image kernel. Returns MACE_SUCCESS, or an error status if kernel
// compilation or enqueueing fails.
//
// context: supplies the GPU device/runtime and the optional StatsFuture.
// input:   NHWC tensor backed by an OpenCL image.
// bias:    bias tensor backed by an OpenCL image (presumably one value per
//          channel -- confirm against the bias_add.cl kernel).
// output:  destination tensor; this function does not resize it, so it is
//          assumed to be pre-sized by the caller -- TODO confirm.
MaceStatus BiasAddKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *bias,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);

  // Channels are processed in blocks of 4 (RoundUpDiv4), matching the
  // 4-channel-per-pixel OpenCL image packing used elsewhere in this dir.
  const index_t channel_blocks = RoundUpDiv4(channels);

  // One work-item per (channel_block, width, height*batch) coordinate.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  // Build the OpenCL kernel lazily on first call and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    // Data type is fixed to float here (this commit removed the per-type
    // template parameter to shrink the compiled library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);

  // Kernel arguments only need rebinding when the input shape changes;
  // otherwise the previously set arguments remain valid.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();
  }

  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);

  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support the global size must be a
    // multiple of the local size, so round each dimension up.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }

    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;

  // Hook up async completion: the future waits on the CL event and can
  // report per-call timing stats for profiling.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }

  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel { class BiasAddKernel : public OpenCLBiasAddKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel { ...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Transforms a tensor stored in an OpenCL buffer into the 2-D OpenCL image
// layout selected by `type` (filter, in/out channel, argument, winograd
// filter, ...). Resizes `output` to the computed image shape and runs the
// matching buffer_to_image kernel. Returns MACE_SUCCESS or an error status
// from kernel build / enqueue.
//
// context:       supplies GPU runtime and optional StatsFuture.
// input:         source tensor backed by an OpenCL buffer.
// type:          which image layout / conversion kernel to use.
// wino_blk_size: winograd block size; only meaningful for WINOGRAD_FILTER
//                (also forwarded to CalImage2DShape).
// output:        destination tensor, resized here to the image shape.
MaceStatus BufferToImage::Compute(
    OpContext *context,
    const Tensor *input,
    const OpenCLBufferType type,
    const int wino_blk_size,
    Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));

  // One work-item per output image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // NOTE(review): no default case below -- an unhandled OpenCLBufferType
  // would leave kernel_name empty and fail at BuildKernel; confirm all enum
  // values are covered.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_buffer_to_image";
      break;
    case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image";
      break;
    case ARGUMENT:kernel_name = "arg_buffer_to_image";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image";
      break;
    case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image";
      break;
    case WINOGRAD_FILTER: {
      std::stringstream ss_tmp;
      // Winograd images pack (blk+2)^2 tiles into the second dimension, so
      // shrink gws[1] accordingly; the kernel name encodes the block size.
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_buffer_to_image_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
  }

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    // Same dtype on both sides: convert in the buffer's own type. Mixed
    // dtypes: the kernel converts through float.
    if (input->dtype() == output->dtype()) {
      auto input_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    } else {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
        "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
  }

  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Rebind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_buffer()));
    // The kernel receives the offset in elements, so the byte offset must be
    // an exact multiple of the element size.
    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
               "buffer offset not aligned");
    kernel_.setArg(idx++,
                   static_cast<uint32_t>(input->buffer_offset() /
                       GetEnumTypeSize(input->dtype())));
    // Per-layout shape arguments; must match the parameter list of the
    // corresponding kernel in buffer_to_image.cl.
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = input->dim(1) * input->dim(2) * input->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();
  }

  // Fixed 16-wide local work size; the second dimension uses the remaining
  // work-group capacity. NOTE(review): assumes kwg_size >= 16, otherwise
  // lws[1] becomes 0 -- presumably guaranteed by the devices supported.
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};

  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Without non-uniform work-group support the global size must be a
    // multiple of the local size, so round each dimension up.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }

    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;

  // Hook up async completion: the future waits on the CL event and can
  // report per-call timing stats for profiling.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }

  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel { class BufferToImage : public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel { ...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Performs a channel-shuffle on an NHWC tensor with the GPU: channels are
// regrouped across `groups_` groups by the channel_shuffle OpenCL kernel.
// Resizes `output` to match `input` and returns MACE_SUCCESS, or an error
// status from resize / kernel build / run.
//
// context: supplies GPU runtime and optional StatsFuture.
// input:   NHWC tensor backed by an OpenCL image; its channel count must be
//          divisible by groups_ (checked below).
// output:  destination tensor, resized like input.
MaceStatus ChannelShuffleKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK(input->dim(3) % groups_ == 0,
             "input channels must be an integral multiple of group. ",
             input->dim(3));
  MACE_RETURN_IF_ERROR(output->ResizeLike(input));

  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channels_per_group = channels / groups_;
  // Work is parallelized over 4-channel blocks within one group
  // (RoundUpDiv4 matches the 4-channel image packing).
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);

  // One work-item per (group channel block, width, height*batch) coordinate.
  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  // Build the OpenCL kernel lazily on first call and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    // Data type is fixed to float here (this commit removed the per-type
    // template parameter to shrink the compiled library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("channel_shuffle", kernel_name,
                             built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);

  // Rebind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();
  }

  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // The tuning key identifies this problem size for the auto-tuner's cache
  // of best local work-group sizes.
  std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));

  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public: public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { ...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context, ...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); auto data_dt = input0->dtype();
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
} }
if (input0->dim(3) % 4 == 0) { if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context, ...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel)); built_options, kernel));
*kwg_size = *kwg_size =
...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context, ...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
} }
} // namespace concat } // namespace concat
MaceStatus ConcatKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
&input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context,
&kernel_,
input_list,
output,
&kwg_size_);
}
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context, ...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
} // namespace concat } // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel { class ConcatKernel : public OpenCLConcatKernel {
public: public:
ConcatKernel() {} ConcatKernel() {}
...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel { ...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    const int32_t axis,
    Tensor *output) {
  // Concatenates the input tensors along `axis` into `output`.
  // All inputs must share the same rank and agree on every dimension
  // except the concat axis.
  MACE_CHECK(!input_list.empty(), "Concat requires at least one input.");
  // Use size_t to avoid the silent size_t -> int narrowing conversion.
  const size_t inputs_count = input_list.size();
  const Tensor *input0 = input_list[0];
  // The output starts from the first input's shape; only the concat
  // axis accumulates the other inputs' extents.
  std::vector<index_t> output_shape(input0->shape());
  for (size_t i = 1; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
    MACE_CHECK(input->dim_size() == input0->dim_size(),
               "Ranks of all input tensors must be same.");
    for (int j = 0; j < input->dim_size(); ++j) {
      if (j == axis) {
        continue;
      }
      MACE_CHECK(input->dim(j) == input0->dim(j),
                 "Dimensions of inputs should equal except axis.");
    }
    output_shape[axis] += input->dim(axis);
  }
  // Map the output shape to its 2-D OpenCL image extent and allocate.
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  switch (inputs_count) {
    case 2:
      // Dedicated two-input kernel: faster than the generic N-way path.
      return concat::Concat2(
          context, &kernel_, input_list[0], input_list[1],
          DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
    default:
      return concat::ConcatN(context, &kernel_, input_list,
                             DataTypeToEnum<T>::value, output, &kwg_size_);
  }
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<mace::index_t> &filter_shape,
    const std::vector<mace::index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_blk_size) {
  // Winograd F(m, 3) only applies to 3x3, stride-1, dilation-1 filters.
  const bool is_k3x3_s1_d1 =
      filter_shape[2] == 3 && filter_shape[3] == 3 &&
      strides[0] <= 1 && strides[1] <= 1 &&
      dilations[0] <= 1 && dilations[1] <= 1;
  if (!is_k3x3_s1_d1) {
    return false;
  }
  const index_t channels_out = filter_shape[0];
  const index_t channels_in = filter_shape[1];
  auto max_image_size = runtime->GetMaxImage2DSize();
  // True iff the Winograd-transformed tensors for block size `blk` fit
  // within the device's 2-D image limits.
  auto fits_image_limit = [&](int blk) -> bool {
    const int tile_sqr = (blk + 2) * (blk + 2);
    const uint64_t transformed_width = static_cast<uint64_t>(
        output_shape[0] *
        ((output_shape[1] + blk - 1) / blk) *
        ((output_shape[2] + blk - 1) / blk));
    return transformed_width < max_image_size[0] &&
           static_cast<uint64_t>(tile_sqr * channels_in) <
               max_image_size[1] &&
           static_cast<uint64_t>(tile_sqr * channels_out) <
               max_image_size[1];
  };
  // Only 4x4 and 2x2 block sizes are implemented on the GPU. Prefer the
  // requested block size 4; fall back to 2 when 4 exceeds the limits.
  if (*wino_blk_size == 4) {
    if (fits_image_limit(4)) {
      return true;
    }
    *wino_blk_size = 2;
  }
  return fits_image_limit(2);
}
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int wino_blk_size,
    Tensor *output) {
  // Runs a 2-D convolution on the GPU, dispatching to a specialized
  // kernel: Winograd (3x3, stride 1) when wino_blk_size != 0, then 1x1,
  // then 3x3, otherwise the generic fallback.
  index_t kernel_h = filter->dim(2);
  index_t kernel_w = filter->dim(3);
  // The OpenCL kernels require equal strides, and dilation > 1 is only
  // supported for stride-1 filters larger than 1x1.
  if (strides[0] != strides[1] ||
      (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << ",dilations " << dilations[0] << "x" << dilations[1]
                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

  // Infer the output shape, and the paddings when not given explicitly.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }

  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  // Return directly from each branch instead of wrapping it in a
  // std::function: the wrapper added type erasure (a possible heap
  // allocation and an indirect call) on this hot path for no benefit,
  // since the lambda was invoked exactly once, immediately.
  if (wino_blk_size != 0) {
    // Winograd convolution; the block size was validated by
    // CheckUseWinograd, so it is 2 or 4 here.
    cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
    uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
    return WinogradConv2dK3x3S1(context,
                                kernels,
                                input,
                                filter,
                                bias,
                                paddings.data(),
                                activation,
                                relux_max_limit,
                                leakyrelu_coefficient,
                                wino_blk_size,
                                &input_shape_,
                                output,
                                kwg_size);
  }
  if (kernel_h == 1 && kernel_w == 1) {
    return Conv2dK1x1(context,
                      &kernels_[0],
                      input,
                      filter,
                      bias,
                      strides[0],
                      paddings.data(),
                      dilations,
                      activation,
                      relux_max_limit,
                      leakyrelu_coefficient,
                      &input_shape_,
                      output,
                      &kwg_size_[0]);
  }
  if (kernel_h == 3 && kernel_w == 3) {
    return Conv2dK3x3(context,
                      &kernels_[0],
                      input,
                      filter,
                      bias,
                      strides[0],
                      paddings.data(),
                      dilations,
                      activation,
                      relux_max_limit,
                      leakyrelu_coefficient,
                      &input_shape_,
                      output,
                      &kwg_size_[0]);
  }
  // Generic kernel for all remaining filter sizes.
  return Conv2d(context,
                &kernels_[0],
                input,
                filter,
                bias,
                strides[0],
                paddings.data(),
                dilations,
                activation,
                relux_max_limit,
                leakyrelu_coefficient,
                &input_shape_,
                output,
                &kwg_size_[0]);
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
此差异已折叠。
...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK1x1(OpContext *context, MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
built_options, kernel)); built_options, kernel));
......
...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK3x3(OpContext *context, MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
built_options, kernel)); built_options, kernel));
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册