提交 85cef1d8 编写于 作者: L luxuhui

adjust opencl code to minify the libmace.so's size

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 23d985f7
...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA) ...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA) enable_language(CUDA)
endif(MACE_ENABLE_CUDA) endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)) if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a") if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so # Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold") set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation { ...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MyCustomOp<DeviceType::GPU, T> : public Operation { class MyCustomOp<DeviceType::GPU, float> : public Operation {
... ...
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) { ...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -5,7 +5,7 @@ package( ...@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"], default_visibility = ["//visibility:public"],
) )
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule") load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library( cc_library(
name = "generated_models", name = "generated_models",
...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule() ...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library( cc_library(
name = "generated_opencl", name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"], srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
......
...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type) std::string key = OpKeyBuilder(op_type)
.Device(device_type) .Device(device_type)
.TypeConstraint("T", dtype) .TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build(); .Build();
if (registry_.at(op_type)->creators.count(key) == 0) { if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key; LOG(FATAL) << "Key not registered: " << key;
......
...@@ -39,7 +39,7 @@ class OpConditionContext { ...@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info); OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default; ~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def); void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const { inline const OperatorDef *operator_def() const {
return operator_def_; return operator_def_;
...@@ -49,7 +49,7 @@ class OpConditionContext { ...@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -110,7 +110,7 @@ class OpConstructContext { ...@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -166,14 +166,14 @@ class Operation { ...@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context); explicit Operation(OpConstructContext *context);
virtual ~Operation() = default; virtual ~Operation() = default;
template <typename T> template<typename T>
inline T GetOptionalArg(const std::string &name, inline T GetOptionalArg(const std::string &name,
const T &default_value) const { const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>( return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value); *operator_def_, name, default_value);
} }
template <typename T> template<typename T>
inline std::vector<T> GetRepeatedArgs( inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const { const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
...@@ -240,7 +240,6 @@ class Operation { ...@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ #define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ } enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo { struct OpRegistrationInfo {
public: public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)> typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
...@@ -290,7 +289,6 @@ class OpConditionBuilder { ...@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_; OpRegistrationInfo::DataFormatSelector data_format_selector_;
}; };
class OpRegistryBase { class OpRegistryBase {
public: public:
OpRegistryBase() = default; OpRegistryBase() = default;
...@@ -315,7 +313,7 @@ class OpRegistryBase { ...@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type) const; DeviceType device_type) const;
template <class DerivedType> template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator( static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) { OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context)); return std::unique_ptr<Operation>(new DerivedType(context));
...@@ -334,6 +332,24 @@ class OpRegistryBase { ...@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \ DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>) OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \ #define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder) op_registry->Register(builder)
......
...@@ -18,20 +18,19 @@ ...@@ -18,20 +18,19 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <mutex> // NOLINT(build/c++11) #include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/utils/macros.h" #include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h" #include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) { const std::string OpenCLErrorToString(cl_int error) {
switch (error) { switch (error) {
case CL_SUCCESS: case CL_SUCCESS:
...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint, const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint, const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage, std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner): std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage), cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage), precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner), tuner_(tuner),
...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) { if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = { std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(), CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback, CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0 CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
}; };
context_ = std::shared_ptr<cl::Context>( context_ = std::shared_ptr<cl::Context>(
...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( ...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true; return true;
} }
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Find program " << program_name << " failed.";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header(" << header << ") is empty.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource( bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name, const std::string &program_name,
const std::string &built_program_key, const std::string &built_program_key,
const std::string &build_options_str, const std::string &build_options_str,
cl::Program *program) { cl::Program *program) {
// Find from source std::string kernel_source;
auto it_source = kEncryptedProgramMap.find(program_name); MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (it_source != kEncryptedProgramMap.end()) { if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources; cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source); sources.push_back(kernel_source);
*program = cl::Program(context(), sources); *program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str()); cl_int ret = program->build({device()}, build_options_str.c_str());
......
...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type", *net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE)); static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i); const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type); runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
......
...@@ -118,9 +118,21 @@ def mace_version_genrule(): ...@@ -118,9 +118,21 @@ def mace_version_genrule():
) )
def encrypt_opencl_kernel_genrule(): def encrypt_opencl_kernel_genrule():
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule( native.genrule(
name = "encrypt_opencl_kernel_gen", name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))], srcs = srcs,
outs = ["opencl/encrypt_opencl_kernel.cc"], outs = outs,
cmd = "cat $(SRCS) > $@;" cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
) )
...@@ -181,7 +181,6 @@ cc_library( ...@@ -181,7 +181,6 @@ cc_library(
], ],
) )
cc_library( cc_library(
name = "internal_ops", name = "internal_ops",
srcs = glob( srcs = glob(
......
...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation { ...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ActivationOp<DeviceType::GPU, T> : public Operation { class ActivationOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ActivationOp(OpConstructContext *context) explicit ActivationOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
ActivationType type = ops::StringToActivationType( ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
"NOOP")); "NOOP"));
auto relux_max_limit = static_cast<T>( auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
Operation::GetOptionalArg<float>("max_limit", 0.0f)); auto leakyrelu_coefficient =
auto leakyrelu_coefficient = static_cast<T>( Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>( kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient); type, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (type == ActivationType::PRELU) { if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation { ...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) { void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Activation") OpConditionBuilder("Activation")
...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) { ...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class AddNOp; class AddNOp;
template <> template<>
class AddNOp<DeviceType::CPU, float> : public Operation { class AddNOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation { ...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class AddNOp<DeviceType::GPU, T> : public Operation { class AddNOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>(); kernel_ = make_unique<opencl::image::AddNKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation { ...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) { void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("AddN") OpConditionBuilder("AddN")
...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) { ...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchNormOp<DeviceType::GPU, T> : public Operation { class BatchNormOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BatchNormOp(OpConstructContext *context) explicit BatchNormOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>( kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient); epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor( const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i)); operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr); MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) { void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation { ...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
} }
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class BatchToSpaceNDOp; class BatchToSpaceNDOp;
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
} }
}; };
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { ...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) { : BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>(); kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { ...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float); BatchToSpaceNDOp, DeviceType::CPU, float);
...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { ...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t); BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -34,16 +34,16 @@ ...@@ -34,16 +34,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BiasAddOp; class BiasAddOp;
template <> template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation { class BiasAddOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)) has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
{} 0)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BiasAddOp<DeviceType::GPU, T> : public Operation { class BiasAddOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>(); kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) { void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("BiasAdd") OpConditionBuilder("BiasAdd")
...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { ...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -23,10 +23,10 @@ ...@@ -23,10 +23,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class ChannelShuffleOp; class ChannelShuffleOp;
template <typename T> template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { ...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_; const int groups_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1); const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups); kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { ...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) { void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle", MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float); ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { ...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1); *op, "group", 1);
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
index_t channels = op->output_shape(0).dims(3); index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups; index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) { if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_PAD_H_ #ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_PAD_H_ #define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -27,4 +27,4 @@ enum PadType { ...@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_PAD_H_ #endif // MACE_OPS_COMMON_PAD_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_POOLING_H_ #ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_POOLING_H_ #define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace { namespace mace {
...@@ -23,4 +23,4 @@ enum PoolingType { ...@@ -23,4 +23,4 @@ enum PoolingType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_POOLING_H_ #endif // MACE_OPS_COMMON_POOLING_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_REDUCE_H_ #ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_REDUCE_H_ #define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace { namespace mace {
...@@ -28,4 +28,4 @@ enum ReduceType { ...@@ -28,4 +28,4 @@ enum ReduceType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_REDUCE_H_ #endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
...@@ -12,14 +12,16 @@ ...@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_ #ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_RESIZE_BICUBIC_H_ #define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h" #include "mace/core/types.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace resize_bicubic { namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10); constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size, inline float CalculateResizeScale(index_t in_size,
...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size, ...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1) ? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size); : in_size / static_cast<float>(out_size);
} }
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_ #endif // MACE_OPS_COMMON_UTILS_H_
...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation { ...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_; int axis_;
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class ConcatOp; class ConcatOp;
template <typename T> template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase { ...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) { : ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>(); kernel_ = make_unique<opencl::image::ConcatKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { ...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) { void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} else { } else {
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3); *op, "axis", 3);
if (!has_data_format || axis != 3) { if (!has_data_format || axis != 3) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
bool divisible_four = true; bool divisible_four = true;
for (const std::string &input : op->input()) { for (const std::string &input : op->input()) {
...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) {
} }
// Only support not divisible 4 case with 2 inputs. // Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) { if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase { ...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public: public:
explicit Conv2dOp(OpConstructContext *context) explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
} }
// Transform filter tensor to target format // Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) && if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(), strides_.data(),
dilations_.data(), dilations_.data(),
&wino_block_size_))) { &wino_block_size_))) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} else { } else {
wino_block_size_ = 0; wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) { ...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -24,10 +24,10 @@ ...@@ -24,10 +24,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class CropOp; class CropOp;
template <class T> template<class T>
class CropOp<DeviceType::CPU, T> : public Operation { class CropOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>(); const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(), crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data()); output_shape, offsets.data());
...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
private: private:
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const int32_t* offsets) { const int32_t *offsets) {
const index_t out_img_size = const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3]; output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3]; const index_t out_hw = output_shape[2] * output_shape[3];
...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) { for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) { for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) { for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr = T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3]; output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch = const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size + input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw + (c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3]; (h + offsets[2]) * input_shape[3] + offsets[3];
...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class CropOp<DeviceType::GPU, T> : public Operation { class CropOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>( kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset")); Operation::GetRepeatedArgs<int>("offset"));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) { void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Crop") OpConditionBuilder("Crop")
...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) { ...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) { if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
} else { } else {
if (operator_def_->input_size() >= 4) { if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
3, 3,
...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) { void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Deconv2D") OpConditionBuilder("Deconv2D")
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class DepthToSpaceOp : public Operation { class DepthToSpaceOp : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation { ...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation { class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1); int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size); kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) { ...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace", MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float); DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t> ...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) { : DepthwiseConv2dOpBase(context) {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
} }
Tensor *filter_tensor = context->workspace()->GetTensor( Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) { if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { ...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t); DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", #ifdef MACE_ENABLE_OPENCL
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("DepthwiseConv2d") OpConditionBuilder("DepthwiseConv2d")
......
...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context) explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type) OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { ...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float); DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class EltwiseOp<DeviceType::GPU, T> : public Operation { class EltwiseOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit EltwiseOp(OpConstructContext *context) explicit EltwiseOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>( kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index); type, coeff, scalar_input, scalar_input_index);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) && if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) { ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
OpenCLBufferType::ARGUMENT, OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { } else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) { ...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t> ...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase { class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public: public:
explicit FullyConnectedOp(OpConstructContext *context) explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) { : FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>(); kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
OpenCLBufferType::WEIGHT_WIDTH, OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) { ...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t); FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation { class IdentityOp : public Operation {
public: public:
explicit IdentityOp(OpConstructContext *context) explicit IdentityOp(OpConstructContext *context)
...@@ -34,15 +33,13 @@ class IdentityOp : public Operation { ...@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
}; };
void RegisterIdentity(OpRegistryBase *op_registry) { void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float); DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation { class InferConv2dShapeOp : public Operation {
public: public:
explicit InferConv2dShapeOp(OpConstructContext *context) explicit InferConv2dShapeOp(OpConstructContext *context)
...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation { ...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation {
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else { } else {
switch (padding_type) { switch (padding_type) {
case SAME: case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0]; out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1]; out_w = (in_w + strides[1] - 1) / strides[1];
break; break;
case VALID: }
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0]; out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1]; out_w = (in_w - kernels[3] + 1) / strides[1];
break; break;
default: }
default: {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
break; break;
} }
} }
}
if (isNCHW) { if (isNCHW) {
output_data[0] = out_batch; output_data[0] = out_batch;
...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation { ...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
}; };
void RegisterInferConv2dShape(OpRegistryBase *op_registry) { void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float); InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t); InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float); InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase { ...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase { class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public: public:
explicit MatMulOp(OpConstructContext *context) explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) { : MatMulOpBase(context) {
...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase { ...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
}; };
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) { void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) { ...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter( ...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t out_chan = input->dim(0); const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter( ...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG; MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter( ...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t multiplier = input->dim(0); const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter( ...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -164,7 +164,6 @@ MaceStatus TransformArgument( ...@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t size = input->dim(0); const index_t size = input->dim(0);
...@@ -181,8 +180,9 @@ MaceStatus TransformArgument( ...@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name); built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -229,6 +229,30 @@ MaceStatus TransformArgument( ...@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus BufferTransform::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, output);
default:
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform( ...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformConv2DFilter( MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformDWConv2DFilter( MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformArgument( MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
class BufferTransform : public OpenCLBufferTransformKernel {
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel { ...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform( ...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform( ...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name); built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<index_t> &filter_shape,
    const std::vector<index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_block_size) {
  // The buffer-based conv2d has no winograd kernel, so most inputs (and the
  // kwg_size_ member) are deliberately unused here.
  MACE_UNUSED(kwg_size_);
  MACE_UNUSED(runtime);
  MACE_UNUSED(output_shape);
  MACE_UNUSED(wino_block_size);
  // Report winograd-eligibility only for the 3x3, unit-stride,
  // unit-dilation case.
  const bool filter_3x3 = (filter_shape[2] == 3) && (filter_shape[3] == 3);
  const bool unit_stride = (strides[0] == 1) && (strides[1] == 1);
  const bool unit_dilation = (dilations[0] == 1) && (dilations[1] == 1);
  return filter_3x3 && unit_stride && unit_dilation;
}
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int winograd_blk_size,
    Tensor *output) {
  // The GPU buffer implementation has no winograd path (CheckUseWinograd
  // never selects one), so the block size is ignored.
  MACE_UNUSED(winograd_blk_size);
  StatsFuture pad_future, conv_future;
  const index_t filter_h = filter->dim(2);
  const index_t filter_w = filter->dim(3);
  // Reshape output: derive the output shape and the paddings from the
  // padding type, or honor explicitly supplied padding values.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // Calculate the padded input shape (tensors are NHWC here).
  const index_t width = output_shape[2];
  const index_t channels = output_shape[3];

  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const int pad_top = paddings[0] >> 1;
  const int pad_left = paddings[1] >> 1;

  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);

  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;

  // Mark whether the input shape changed; a change forces the cached kernel
  // arguments to be re-set.
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  // 1x1 convolutions use a dedicated kernel with a narrower width tile.
  const bool use_1x1 = filter_h == 1 && filter_w == 1;

  std::vector<index_t> padded_output_shape = output_shape;
  const index_t tile_c = 4;
  const index_t tile_w = use_1x1 ? 2 : 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);

  // The padded width must cover every strided/dilated filter tap of the
  // tiled output; channels are rounded up to the vectorization tile.
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);

  const Tensor *padded_input_ptr = input;
  // Pad the input only when the padded shape actually differs.
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    // Decide the scratch size before allocating it.
    index_t total_scratch_size = 0;
    const index_t padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer.
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      // The scratch buffer was reallocated, so kernel arguments referencing
      // the old storage must be refreshed.  (The original code redundantly
      // re-tested the same condition via |= inside this guard.)
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  if (use_1x1) {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2d1x1(
          context, &kernels_[1], pad_input, filter, bias, strides,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  } else {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2dGeneral(
          context, &kernels_[1], pad_input, filter, bias, strides, dilations,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  }
  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
  // Callers wait on a single future that covers both the pad and conv steps.
  MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());

  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context, ...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace conv2d } // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel { class Conv2dKernel : public OpenCLConv2dKernel {
public: public:
Conv2dKernel() : old_scratch_size_(0) {} Conv2dKernel() : old_scratch_size_(0) {}
...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { ...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} }
} // namespace depthwise } // namespace depthwise
// Runs a buffer-based depthwise conv2d on the GPU.
// Flow: compute output shape/paddings from a "fake" dense-conv filter shape,
// pad the input (spatially by the padding amounts, and width/channels up to
// the tile size) into a scratch tensor when needed, then launch the
// depthwise_conv2d OpenCL kernel on the (possibly padded) input.
// Returns MACE_SUCCESS, or propagates any resize/kernel error via
// MACE_RETURN_IF_ERROR.
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
// Separate futures so the pad kernel and the conv kernel can be merged
// into the caller's future at the end.
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
// (depthwise filter layout is [multiplier, in_channels, kh, kw]; the fake
// shape folds multiplier*in_channels into the out-channel slot so the
// generic conv2d shape helpers apply).
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
// Either derive paddings from the padding policy, or honor explicit
// per-model padding values.
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
// Half the total padding goes on the top/left (the pad kernel presumably
// places the remainder on the bottom/right).
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
// Tile sizes the OpenCL kernel processes per work-item: width and channels
// are rounded up to multiples of 4.
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
// Padded input width must cover the receptive field of the last (rounded-up)
// output column.
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
// Only materialize a padded copy when any dimension actually changed.
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
// A reallocated (grown) scratch buffer invalidates previously-set kernel
// args, so force the kernel-arg refresh path below.
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
// The padded tensor aliases scratch memory; its lifetime ends with this
// call, after the futures are merged.
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
// Surface both launches' timing/completion through the caller's future.
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace depthwise } // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public: public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {} DepthwiseConv2dKernel() : old_scratch_size_(0) {}
...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { ...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private: private:
index_t old_scratch_size_; index_t old_scratch_size_;
cl::Kernel kernels_[2]; cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs buffer-based pooling (max or average) on the GPU.
// Flow: compute the output shape/paddings, pad the input channel dimension up
// to a multiple of 4 into scratch memory if needed, build the "pooling"
// OpenCL kernel on first use (compile options pick the in/out data types and
// the accumulator type), set kernel args when the input shape changed, then
// tune-or-run the 3D kernel.
// Dilation is not supported and is rejected up front.
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
// Fake filter shape (out_c, in_c, kh, kw) so the generic conv shape helpers
// can compute pooling's output size.
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
// Either derive paddings from the padding policy, or honor explicit values.
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
// Channels are rounded up to a multiple of 4 (the kernel reads 4-channel
// vectors); height/width are not padded here.
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
// A grown scratch buffer invalidates kernel args set against the old
// allocation, so force the arg-refresh path below.
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
// The padded tensor aliases scratch memory; it lives only for this call.
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
// kernels_[0] is the pad kernel, kernels_[1] the pooling kernel proper.
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
// Lazily compile the OpenCL program the first time this op runs.
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
auto input_dtype = input->dtype();
auto input_dt = DtToCLDt(input_dtype);
built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
auto output_dtype = output->dtype();
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
// MAX pooling only compares values, so the native dtype is a safe
// intermediate; AVG (or mixed dtypes) accumulates in float to avoid
// half-precision overflow/precision loss.
if (pooling_type == MAX && input_dtype == output_dtype) {
built_options.emplace("-DDATA_TYPE=" + input_dt);
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
// Global work size: (ceil(out_c/4), out_w, batch*out_h).
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
// Kernel args only need to be re-set when shapes/buffers changed.
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
// Default local work size; the tuner may override it.
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
// Surface both launches' timing/completion through the caller's future.
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel { class PoolingKernel : public OpenCLPoolingKernel {
public: public:
PoolingKernel() : old_scratch_size_(0) {} PoolingKernel() : old_scratch_size_(0) {}
...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel { ...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs buffer-based softmax over the channel dimension on the GPU.
// Accepts 2D (batch, channels) or 4D NHWC logits; anything else hits
// MACE_NOT_IMPLEMENTED. The kernel is compiled lazily on first use with the
// in/out data types baked in as compile options (accumulation in float);
// args are re-set only when the input shape changes.
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
// Normalize 2D input to the 4D NHWC view the kernel expects.
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
// Channels are processed in groups of 4; remain_channels is the tail the
// kernel must mask out of the last group.
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
// Global work size: (ceil(c/4), w, batch*h).
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
// Lazily compile the OpenCL program the first time this op runs.
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
// Accumulate in float regardless of the tensor dtypes, for precision.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
// log-softmax variant.
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
// Kernel args only need to be re-set when the logits shape changed.
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
// Default local work size; the tuner may override it.
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -29,7 +29,7 @@ namespace mace { ...@@ -29,7 +29,7 @@ namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log) explicit SoftmaxKernel(bool use_log)
...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { ...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -20,11 +20,11 @@ ...@@ -20,11 +20,11 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BufferTransformOp; class BufferTransformOp;
template <typename T> template<>
class BufferTransformOp<DeviceType::GPU, T> : public Operation { class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BufferTransformOp(OpConstructContext *context) explicit BufferTransformOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor( MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type(); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform( return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output); context, input, type, out_mem_type_, wino_blk_size_, output);
} }
...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_; MemoryType out_mem_type_;
}; };
void RegisterBufferTransform(OpRegistryBase *op_registry) { void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform", MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
} }
} // namespace ops } // namespace ops
......
...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) { ...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix; return name + postfix;
} }
// Transforms a filter tensor into the GPU-side layout (image or buffer) at
// graph-construction time, and rewires op_def's input to point at the
// transformed tensor.
// - input_idx: which input of op_def is the filter to transform.
// - buffer_type / mem_type: target OpenCL buffer type and memory type.
// - wino_blk_size: Winograd block size; presumably 0 means no Winograd —
//   TODO confirm against OpenCLBufferTransformer::Transform.
// Side effects: creates a persistent workspace tensor named by
// TransformedFilterName, rewrites op_def's input name, and marks the
// original filter tensor unused so its memory can be reclaimed.
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size) {
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
// Keep the source tensor's dtype for the transformed output.
const DataType dt = input->dtype();
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -28,17 +28,16 @@ ...@@ -28,17 +28,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
// Only used for GPU Operation(BufferTransform) // Only used for GPU Operation(BufferTransform)
template<typename T>
class OpenCLBufferTransformer { class OpenCLBufferTransformer {
public: public:
OpenCLBufferTransformer(const MemoryType in_mem_type, OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) { const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) { if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>(); kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) { } else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>(); kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else { } else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>(); kernel_ = make_unique<opencl::buffer::BufferTransform>();
} }
} }
...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer { ...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size, const int wino_blk_size,
Tensor *output) { Tensor *output) {
Workspace *ws = context->workspace(); Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value; DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type(); MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE || if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) { out_mem_type == MemoryType::GPU_BUFFER) {
...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer { ...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name() << " to CPU Buffer " << output->name()
<< " with data type " << dt; << " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor); Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>(); const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape()); output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>(); float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else { } else {
LOG(FATAL) << "Unexpected error: " << out_mem_type; LOG(FATAL) << "Unexpected error: " << out_mem_type;
...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer { ...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name); std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter( MaceStatus TransformFilter(
mace::OpConstructContext *context, mace::OpConstructContext *context,
OperatorDef *op_def, OperatorDef *op_def,
const int input_idx, const int input_idx,
const OpenCLBufferType buffer_type, const OpenCLBufferType buffer_type,
const MemoryType mem_type, const MemoryType mem_type,
const int wino_blk_size = 0) { const int wino_blk_size = 0);
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
class OpContext; class OpContext;
......
...@@ -17,7 +17,10 @@ ...@@ -17,7 +17,10 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace { namespace mace {
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#include <vector> #include <vector>
#include "mace/ops/common/activation_type.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace { namespace mace {
......
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
......
...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) { ...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
} }
} }
// Maps a CPU data type to its upward-compatible OpenCL C type name:
// both DT_FLOAT and DT_HALF widen to "float" so a single float kernel
// binary can serve either storage type. Any other data type is a fatal
// error (LOG(FATAL)); the trailing return only silences the compiler.
std::string DtToUpCompatibleCLDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "float";
  }
  LOG(FATAL) << "Unsupported data type";
  return "";
}
// Maps a CPU data type to the upward-compatible OpenCL command-data-type
// suffix: both DT_FLOAT and DT_HALF map to "f" (float read/write
// commands). Any other data type is a fatal error (LOG(FATAL)); the
// trailing return only silences the compiler.
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "f";
  }
  LOG(FATAL) << "Not supported data type for opencl cmd data type";
  return "";
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws, const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
......
...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape( ...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type // CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type // CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt); std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select // CPU data type to OpenCL condition data type used in select
// e.g. half -> float // e.g. half -> float
std::string DtToCLCondDt(const DataType dt); std::string DtToCLCondDt(const DataType dt);
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Applies the configured activation (RELU / RELUX / PRELU / TANH /
// SIGMOID / LEAKYRELU, chosen at kernel-compile time via a -DUSE_*
// option) element-wise to `input`, writing the result to `output`
// through the "activation" OpenCL image kernel.
//
// `alpha` is read only when activation_ == PRELU (its image holds the
// per-channel slopes); for other activation types it may be null.
// The OpenCL kernel is built once on first call — always with float
// DATA_TYPE/CMD_DATA_TYPE — and cached in kernel_; kernel arguments are
// re-bound only when the input shape changes between calls.
// Returns MACE_SUCCESS, or the error from kernel build / tuned run.
MaceStatus ActivationKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *alpha,
    Tensor *output) {
  // Input is indexed as NHWC; channels are processed 4 per work-item,
  // hence RoundUpDiv4 below.
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build; the activation type is baked into the
  // binary via -DUSE_*, and tuning_key_prefix_ is chosen to match.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    // GPU computation is pinned to float regardless of the tensor's
    // storage type (this commit drops the half specialization to
    // shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    switch (activation_) {
      case RELU: {
        tuning_key_prefix_ = "relu_opencl_kernel";
        built_options.emplace("-DUSE_RELU");
        break;
      }
      case RELUX: {
        tuning_key_prefix_ = "relux_opencl_kernel";
        built_options.emplace("-DUSE_RELUX");
        break;
      }
      case PRELU: {
        tuning_key_prefix_ = "prelu_opencl_kernel";
        built_options.emplace("-DUSE_PRELU");
        break;
      }
      case TANH: {
        tuning_key_prefix_ = "tanh_opencl_kernel";
        built_options.emplace("-DUSE_TANH");
        break;
      }
      case SIGMOID: {
        tuning_key_prefix_ = "sigmoid_opencl_kernel";
        built_options.emplace("-DUSE_SIGMOID");
        break;
      }
      case LEAKYRELU: {
        tuning_key_prefix_ = "leakyrelu_opencl_kernel";
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      }
      default: {
        LOG(FATAL) << "Unknown activation type: " << activation_;
      }
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size: (channel blocks, width, batch * height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape differs from the
  // previous invocation.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      kernel_.setArg(idx++, *(alpha->opencl_image()));
    }
    // Scalar parameters are always passed; the kernel only uses the one
    // matching its compiled -DUSE_* variant.
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,12 +31,11 @@ namespace ops { ...@@ -31,12 +31,11 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel { class ActivationKernel : public OpenCLActivationKernel {
public: public:
ActivationKernel(ActivationType type, ActivationKernel(ActivationType type,
T relux_max_limit, float relux_max_limit,
T leakyrelu_coefficient) float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit), : activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {} leakyrelu_coefficient_(leakyrelu_coefficient) {}
...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel { ...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private: private:
ActivationType activation_; ActivationType activation_;
T relux_max_limit_; float relux_max_limit_;
T leakyrelu_coefficient_; float leakyrelu_coefficient_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
}; };
// Templated variant (removed by this commit in favor of the float-only
// ActivationKernel): identical dispatch logic, but the kernel's OpenCL
// data type is derived from T via the "up-compatible" mappings (half is
// widened to float), and the scalar limits are cast from T to float
// before being passed to the kernel.
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *alpha,
    Tensor *output) {
  // NHWC layout; channels handled 4 per work-item.
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // One-time kernel build keyed on T and on the activation type.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    // half and float both map to the float OpenCL types here.
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    switch (activation_) {
      case RELU:
        tuning_key_prefix_ = "relu_opencl_kernel";
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        tuning_key_prefix_ = "relux_opencl_kernel";
        built_options.emplace("-DUSE_RELUX");
        break;
      case PRELU:
        tuning_key_prefix_ = "prelu_opencl_kernel";
        built_options.emplace("-DUSE_PRELU");
        break;
      case TANH:
        tuning_key_prefix_ = "tanh_opencl_kernel";
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        tuning_key_prefix_ = "sigmoid_opencl_kernel";
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        tuning_key_prefix_ = "leakyrelu_opencl_kernel";
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size: (channel blocks, width, batch * height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Arguments are re-bound only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      kernel_.setArg(idx++, *(alpha->opencl_image()));
    }
    // Members are of type T; the kernel expects float scalars.
    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
    kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Element-wise sum of 2 to 4 equally-shaped NHWC image tensors into
// `output_tensor` via the "addn" OpenCL kernel.
//
// Preconditions (enforced with MACE_CHECK): at least two non-null
// inputs, all with identical batch/height/width/channels. More than 4
// inputs hits MACE_NOT_IMPLEMENTED (the input count is baked into the
// kernel binary via -DINPUT_NUM).
// The kernel is built once (always float DATA_TYPE) and cached;
// arguments and the output image are (re)set only when the first
// input's shape changes.
// Returns MACE_SUCCESS, or the error from resize / build / tuned run.
MaceStatus AddNKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor) {
  size_t size = input_tensors.size();
  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
  const index_t batch = input_tensors[0]->dim(0);
  const index_t height = input_tensors[0]->dim(1);
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // All inputs must match the first input's shape exactly.
  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
    MACE_CHECK(height == input_tensors[i]->dim(1));
    MACE_CHECK(width == input_tensors[i]->dim(2));
    MACE_CHECK(channels == input_tensors[i]->dim(3));
  }
  if (kernel_.get() == nullptr) {
    // INPUT_NUM is a compile-time constant of the kernel; only up to 4
    // inputs are supported.
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
    built_options.emplace("-Daddn=" + kernel_name);
    // Float-only GPU computation (half specialization removed by this
    // commit to shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: x = channel_blocks * width, y = batch * height.
  std::vector<index_t> output_shape = input_tensors[0]->shape();
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    // Resize the output image to match the (shared) input shape before
    // binding it as a kernel argument.
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(
        output_tensor->ResizeImage(output_shape, output_image_shape));
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
  }
  // Fixed local work size (0 terminator); tuned/run as a 2-D kernel.
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel { class AddNKernel : public OpenCLAddNKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel { ...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated variant (removed by this commit in favor of the float-only
// AddNKernel): identical logic, but the kernel's OpenCL data type is
// derived from T via the "up-compatible" mappings (half widens to
// float).
template <typename T>
MaceStatus AddNKernel<T>::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor) {
  // At least two non-null inputs, all with identical NHWC dims.
  size_t size = input_tensors.size();
  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
  const index_t batch = input_tensors[0]->dim(0);
  const index_t height = input_tensors[0]->dim(1);
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
    MACE_CHECK(height == input_tensors[i]->dim(1));
    MACE_CHECK(width == input_tensors[i]->dim(2));
    MACE_CHECK(channels == input_tensors[i]->dim(3));
  }
  if (kernel_.get() == nullptr) {
    // INPUT_NUM is baked into the kernel; at most 4 inputs supported.
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
    built_options.emplace("-Daddn=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: x = channel_blocks * width, y = batch * height.
  std::vector<index_t> output_shape = input_tensors[0]->shape();
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    // Resize the output image before binding arguments.
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(
        output_tensor->ResizeImage(output_shape, output_image_shape));
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Runs batch normalization (optionally fused with an activation) on an
// NHWC image tensor via the "batch_norm" OpenCL kernel.
//
// Two modes, decided by whether `mean` and `var` are both provided:
//  - not folded: the kernel normalizes with mean/var/epsilon_ at run
//    time, then applies scale/offset;
//  - folded (-DFOLDED_CONSTANT): mean/var were already folded into
//    scale/offset offline, so only scale/offset are bound.
// The kernel is built once (always float DATA_TYPE) and cached;
// arguments are re-bound only when the input shape changes.
// Returns MACE_SUCCESS, or the error from kernel build / tuned run.
MaceStatus BatchNormKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
    const Tensor *mean,
    const Tensor *var,
    Tensor *output) {
  bool not_folded = (mean != nullptr && var != nullptr);
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed 4 per work-item in the OpenCL image.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build; activation variant and folded-ness are
  // compile-time options of the kernel binary.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    // Float-only GPU computation (half specialization removed by this
    // commit to shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (!not_folded) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
    switch (activation_) {
      case NOOP:break;
      case RELU:built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape has changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
    // mean/var/epsilon are only arguments of the non-folded kernel.
    if (not_folded) {
      kernel_.setArg(idx++, *(mean->opencl_image()));
      kernel_.setArg(idx++, *(var->opencl_image()));
      kernel_.setArg(idx++, epsilon_);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel { class BatchNormKernel : public OpenCLBatchNormKernel {
public: public:
BatchNormKernel( BatchNormKernel(
...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel { ...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated constructor (removed by this commit in favor of the
// non-template BatchNormKernel): stores the batch-norm configuration.
// epsilon is forwarded to the kernel only in the non-folded mode of
// Compute(); the activation fields select the fused activation.
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
                                    const ActivationType activation,
                                    const float relux_max_limit,
                                    const float leakyrelu_coefficient)
    : epsilon_(epsilon),
      activation_(activation),
      relux_max_limit_(relux_max_limit),
      leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Templated variant (removed by this commit in favor of the float-only
// BatchNormKernel): identical logic, but the kernel's OpenCL data type
// is derived from T via the "up-compatible" mappings (half widens to
// float). See the non-template version for the folded / non-folded
// mode description.
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
    const Tensor *mean,
    const Tensor *var,
    Tensor *output) {
  // Non-folded mode requires both mean and var tensors.
  bool not_folded = (mean != nullptr && var != nullptr);
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // One-time kernel build keyed on T, activation, and folded-ness.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (!not_folded) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
    switch (activation_) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Arguments re-bound only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
    // mean/var/epsilon are only bound for the non-folded kernel.
    if (not_folded) {
      kernel_.setArg(idx++, *(mean->opencl_image()));
      kernel_.setArg(idx++, *(var->opencl_image()));
      kernel_.setArg(idx++, epsilon_);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Rearranges a batched NHWC image tensor back into spatial form
// (inverse of space-to-batch) via the "batch_to_space" OpenCL kernel.
//
// `paddings` ({top, bottom, left, right} — only indices 0 and 2 are
// passed to the kernel as crop offsets) and `block_shape` ({block_h,
// block_w}) describe the original space-to-batch transform;
// `output_shape` is the target spatial shape, to which `space_tensor`'s
// image is resized before the run.
// Unlike the other kernels in this commit, the build uses the batch
// tensor's own stored dtype rather than a fixed DT_FLOAT.
// Returns MACE_SUCCESS, or the error from resize / build / tuned run.
MaceStatus BatchToSpaceKernel::Compute(
    OpContext *context,
    const Tensor *batch_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *space_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      space_tensor->ResizeImage(output_shape, output_image_shape));
  // Global work size is derived from the *batch* tensor: one work-item
  // per (channel block, width, batch * height) position.
  const uint32_t chan_blk =
      static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build, typed after the batch tensor's dtype.
  if (kernel_.get() == nullptr) {
    const char *kernel_name = "batch_to_space";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    auto dt = batch_tensor->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the batch tensor's shape has changed.
  if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2]: height / width crop offsets.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = batch_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { ...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Adds a per-channel bias to an NHWC image tensor on the GPU:
// output[n, h, w, c] = input[n, h, w, c] + bias[c].
// The CL kernel is built lazily on first use and cached in kernel_;
// kernel arguments are re-set only when the input shape changes.
MaceStatus BiasAddKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *bias,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed 4-wide into image pixels, so the work grid
  // iterates over channel blocks rather than raw channels.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Global work size: x = channel blocks, y = width, z = batch*height.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    // The kernel is always compiled for float data; a single data-type
    // variant keeps the binary small.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed since the
  // previous invocation.
  // NOTE(review): output is assumed to be already sized by the caller —
  // no ResizeLike/ResizeImage happens here; confirm against the op layer.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    // Device supports non-uniform work-groups: enqueue with the exact
    // global size.
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Otherwise the global size must be a multiple of the local size:
    // round each dimension up (guarding against a zero local size).
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // If the caller wants async completion, hand back a wait function that
  // blocks on the CL event and optionally fills in profiling stats.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel { class BiasAddKernel : public OpenCLBiasAddKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel { ...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Copies a tensor stored in an OpenCL buffer into an OpenCL 2-D image,
// choosing the layout-specific transform kernel from `type`. For
// WINOGRAD_FILTER the filter is additionally transformed for the given
// Winograd block size. Resizes `output` to the computed image shape.
MaceStatus BufferToImage::Compute(
    OpContext *context,
    const Tensor *input,
    const OpenCLBufferType type,
    const int wino_blk_size,
    Tensor *output) {
  // Normalize the buffer shape for this layout, then derive the 2-D
  // image extents the data will occupy.
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
  // One work item per output image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Pick the transform kernel matching the buffer layout.
  // NOTE(review): no default case — an OpenCLBufferType outside this list
  // leaves kernel_name empty; presumably callers only pass the listed
  // types. Confirm at the call sites.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_buffer_to_image";
      break;
    case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image";
      break;
    case ARGUMENT:kernel_name = "arg_buffer_to_image";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image";
      break;
    case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image";
      break;
    case WINOGRAD_FILTER: {
      // The Winograd kernel writes (blk+2)^2 transformed values per
      // filter, so shrink the y work size accordingly and select the
      // kernel variant for this block size.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_buffer_to_image_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (input->dtype() == output->dtype()) {
      // Same dtype on both sides: compile the kernel for that type.
      auto input_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    } else {
      // Mixed dtypes: fall back to float as the common representation.
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
        "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_buffer()));
    // The kernel addresses the buffer in elements, so the byte offset
    // must be element-aligned and is converted to an element offset.
    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
               "buffer offset not aligned");
    kernel_.setArg(idx++,
                   static_cast<uint32_t>(input->buffer_offset() /
                       GetEnumTypeSize(input->dtype())));
    // Layout-specific geometry arguments.
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = input->dim(1) * input->dim(2) * input->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
    } else {
      // Remaining layouts use the normalized (formatted) buffer shape.
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Fixed 2-D local work size of 16 * (kwg_size / 16) work items.
  // NOTE(review): assumes kwg_size >= 16; a smaller max work-group size
  // would make lws[1] zero — confirm against supported devices.
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    // Exact global size is allowed on this device.
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Global size must be a multiple of the local size: round up.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Optional async completion: hand back a wait function that blocks on
  // the CL event and optionally fills in profiling stats.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel { class BufferToImage : public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel { ...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Shuffles the channel dimension of an NHWC image tensor across groups_
// groups on the GPU. Requires the channel count to be divisible by
// groups_; output is resized to match the input.
MaceStatus ChannelShuffleKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK(input->dim(3) % groups_ == 0,
             "input channels must be an integral multiple of group. ",
             input->dim(3));
  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channels_per_group = channels / groups_;
  // Channels are packed 4-wide into image pixels; the work grid iterates
  // over the channel blocks of a single group.
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
  // Global work size: x = per-group channel blocks, y = width,
  // z = batch*height.
  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    // The kernel is always compiled for float data; a single data-type
    // variant keeps the binary small.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("channel_shuffle", kernel_name,
                             built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Dispatch through the auto-tuner, keyed by the output shape so tuned
  // local sizes are reused for identical shapes.
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public: public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { ...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context, ...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); auto data_dt = input0->dtype();
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
} }
if (input0->dim(3) % 4 == 0) { if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context, ...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel)); built_options, kernel));
*kwg_size = *kwg_size =
...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context, ...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
} }
} // namespace concat } // namespace concat
// Concatenates the given tensors along `axis` on the GPU. Validates that
// all inputs share rank and agree on every dimension except `axis`,
// resizes the output image, then dispatches to the specialized two-input
// kernel or the generic N-input kernel.
// NOTE(review): input_list is assumed non-empty and axis in range — both
// are used unchecked here; confirm the op layer guarantees this.
MaceStatus ConcatKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    const int32_t axis,
    Tensor *output) {
  const int inputs_count = input_list.size();
  const Tensor *input0 = input_list[0];
  // Output shape = input0's shape with the axis dimension summed over
  // all inputs.
  std::vector<index_t> output_shape(input0->shape());
  for (int i = 1; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
    MACE_CHECK(input->dim_size() == input0->dim_size(),
               "Ranks of all input tensors must be same.");
    for (int j = 0; j < input->dim_size(); ++j) {
      if (j == axis) {
        continue;
      }
      MACE_CHECK(input->dim(j) == input0->dim(j),
                 "Dimensions of inputs should equal except axis.");
    }
    output_shape[axis] += input->dim(axis);
  }
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Two inputs get a dedicated, faster kernel; anything else goes
  // through the generic multi-input path.
  switch (inputs_count) {
    case 2:
      return concat::Concat2(
          context, &kernel_, input_list[0], input_list[1],
          &input_shape_, output, &kwg_size_);
    default:
      return concat::ConcatN(context,
                             &kernel_,
                             input_list,
                             output,
                             &kwg_size_);
  }
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context, ...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
} // namespace concat } // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel { class ConcatKernel : public OpenCLConcatKernel {
public: public:
ConcatKernel() {} ConcatKernel() {}
...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel { ...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, &kwg_size_);
}
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<mace::index_t> &filter_shape,
    const std::vector<mace::index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_blk_size) {
  // Winograd is only applicable to 3x3, stride-1, dilation-1 convolutions.
  const bool winograd_compatible =
      filter_shape[2] == 3 && filter_shape[3] == 3 &&
      strides[0] <= 1 && strides[1] <= 1 &&
      dilations[0] <= 1 && dilations[1] <= 1;
  if (!winograd_compatible) {
    return false;
  }
  const index_t out_channels = filter_shape[0];
  const index_t in_channels = filter_shape[1];
  auto max_image_size = runtime->GetMaxImage2DSize();
  // A block size is usable only if the winograd-transformed input, filter
  // and output 2D images all fit within the device's image size limits.
  auto fits_opencl_image = [&](int blk) -> bool {
    const int sqr_block = (blk + 2) * (blk + 2);
    const uint64_t transformed_width = static_cast<uint64_t>(
        output_shape[0] *
        ((output_shape[1] + blk - 1) / blk) *
        ((output_shape[2] + blk - 1) / blk));
    if (transformed_width >= max_image_size[0]) {
      return false;
    }
    if (static_cast<uint64_t>(sqr_block * in_channels) >= max_image_size[1]) {
      return false;
    }
    return static_cast<uint64_t>(sqr_block * out_channels) <
        max_image_size[1];
  };
  // GPU only supports 4x4 and 2x2 winograd blocks: try the requested 4
  // first, falling back to 2 when 4 would exceed the image limits.
  if (*wino_blk_size == 4) {
    if (fits_opencl_image(4)) {
      return true;
    }
    *wino_blk_size = 2;
  }
  return fits_opencl_image(2);
}
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int wino_blk_size,
    Tensor *output) {
  const index_t kernel_h = filter->dim(2);
  const index_t kernel_w = filter->dim(3);
  // Unequal strides are unsupported, as are dilated kernels combined with
  // stride > 1 or a 1xN filter.
  if (strides[0] != strides[1] ||
      (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << ",dilations " << dilations[0] << "x" << dilations[1]
                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

  // Derive the output geometry; explicit paddings take precedence over
  // the symbolic padding type.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }

  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  // Dispatch to the most specialized implementation available:
  // winograd, then 1x1, then 3x3, then the generic conv kernel.
  if (wino_blk_size != 0) {
    cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
    uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
    return WinogradConv2dK3x3S1(context, kernels, input, filter, bias,
                                paddings.data(), activation, relux_max_limit,
                                leakyrelu_coefficient, wino_blk_size,
                                &input_shape_, output, kwg_size);
  }
  if (kernel_h == 1 && kernel_w == 1) {
    return Conv2dK1x1(context, &kernels_[0], input, filter, bias, strides[0],
                      paddings.data(), dilations, activation, relux_max_limit,
                      leakyrelu_coefficient, &input_shape_, output,
                      &kwg_size_[0]);
  }
  if (kernel_h == 3 && kernel_w == 3) {
    return Conv2dK3x3(context, &kernels_[0], input, filter, bias, strides[0],
                      paddings.data(), dilations, activation, relux_max_limit,
                      leakyrelu_coefficient, &input_shape_, output,
                      &kwg_size_[0]);
  }
  return Conv2d(context, &kernels_[0], input, filter, bias, strides[0],
                paddings.data(), dilations, activation, relux_max_limit,
                leakyrelu_coefficient, &input_shape_, output, &kwg_size_[0]);
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size, const int wino_blk_size,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size[3]); uint32_t *kwg_size[3]);
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel { class Conv2dKernel : public OpenCLConv2dKernel {
public: public:
bool CheckUseWinograd( bool CheckUseWinograd(
...@@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { ...@@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
// GPU only supports 4x4 and 2x2 gpu winograd convolution
if (*wino_blk_size == 4) {
// if block size == 4 exceed OpenCL image size limitation, fallback to 2
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// use winograd covolution
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK1x1(OpContext *context, MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
built_options, kernel)); built_options, kernel));
......
...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK3x3(OpContext *context, MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
built_options, kernel)); built_options, kernel));
......
...@@ -67,7 +67,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -67,7 +67,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2d(OpContext *context, MaceStatus Conv2d(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -78,7 +78,6 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -78,7 +78,6 @@ extern MaceStatus Conv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -101,30 +100,37 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -101,30 +100,37 @@ extern MaceStatus Conv2d(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
built_options, kernel)); built_options, kernel));
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/crop.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus CropKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    Tensor *output) {
  // Crops input_list[0] down to the extents of input_list[1] along every
  // dimension whose configured offset_ is non-negative (a negative offset
  // means "keep the full dimension").
  const int32_t inputs_count = static_cast<int32_t>(input_list.size());
  MACE_CHECK(inputs_count >= 2)
      << "Crop opencl kernel only support 2 elements input";
  const Tensor *input0 = input_list[0];
  const Tensor *input1 = input_list[1];
  const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
  // Fix: the second rank must come from input1; the original read
  // input0->dim_size() twice, so input1's rank was never validated.
  const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
  MACE_CHECK(in0_dims == 4 && in1_dims == 4,
             "Crop op only supports 4-dims inputs now.");

  // Resolve the output shape and per-dimension crop offsets, checking that
  // the cropped window stays inside input0.
  std::vector<int32_t> offsets(4, 0);
  std::vector<index_t> output_shape(input0->shape());
  for (index_t i = 0; i < in0_dims; ++i) {
    if (offset_[i] >= 0) {
      output_shape[i] = input1->dim(i);
      offsets[i] = offset_[i];
      MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
          << "the crop for dimension " << i
          << " is out of bound, first input size "
          << input0->dim(i) << ", offset " << offsets[i]
          << ", second input size " << input1->dim(i);
    }
  }
  // The OpenCL image layout packs channels in blocks of 4, so the channel
  // offset must be 4-aligned.
  MACE_CHECK(offsets[3] % 4 == 0,
             "MACE opencl only supports cropping channel"
             " offset divisible by 4.");
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));

  const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
  const index_t channel_blk = RoundUpDiv4(output->dim(3));
  // Global work size: (channel blocks, output width, batch * output height).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1))
  };
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  // Build the kernel lazily on first use; the data type is taken from the
  // input tensor rather than a template parameter.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
    built_options.emplace("-Dcrop=" + kernel_name);
    auto dt = input0->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments are re-set only when the input shape changes.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(offsets[0]));
    kernel_.setArg(idx++, static_cast<int>(offsets[1]));
    kernel_.setArg(idx++, static_cast<int>(offsets[2]));
    kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
    kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class CropKernel : public OpenCLCropKernel { class CropKernel : public OpenCLCropKernel {
public: public:
explicit CropKernel( explicit CropKernel(
...@@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel { ...@@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus CropKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop opencl kernel only support 2 elements input";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t >(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t >(input0->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
for (index_t i = 0; i < in0_dims; ++i) {
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i
<< " is out of bound, first input size "
<< input0->dim(i) << ", offset " << offsets[i]
<< ", second input size " << input1->dim(i);
}
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE opencl only supports cropping channel"
" offset divisible by 4.");
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/deconv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus Deconv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class Deconv2dKernel : public OpenCLDeconv2dKernel { class Deconv2dKernel : public OpenCLDeconv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { ...@@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus Deconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depth_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the OpenCL depth_to_space kernel on image memory: rearranges
// [N, H, W, C] into [N, H*B, W*B, C/(B*B)] where B = block_size_.
// Resizes `output`, lazily builds and caches the cl::Kernel in kernel_,
// and re-binds kernel arguments only when the input shape changes.
MaceStatus DepthToSpaceKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  // Channels must split evenly into block_size * block_size spatial positions.
  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
             "input depth should be dividable by block_size * block_size ",
             input_depth);
  const index_t output_height = input_height * block_size_;
  const index_t output_width = input_width * block_size_;
  const index_t output_depth = input_depth / (block_size_ * block_size_);
  // Only output channel counts that fill whole 4-wide texels, or the
  // special small cases 1/2/3, are supported by the kernels.
  MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
             "output channel not support:") << output_depth;
  std::vector<index_t> output_shape = {batch,
                                       output_height,
                                       output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Global work size: output depth 1/2 runs the d1_d2 kernel over the input
  // geometry; all other depths lay the work out over the output geometry.
  uint32_t gws[3];
  if (output_depth < 3) {
    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
    gws[1] = static_cast<uint32_t>(input_width);
    gws[2] = static_cast<uint32_t>(input_height * batch);
  } else {
    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
    gws[1] = static_cast<uint32_t>(output_width);
    gws[2] = static_cast<uint32_t>(output_height * batch);
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the program on first use only; kernel_ caches the result.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "depth_to_space";
    if (output_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", output_depth));
      // Depths 1 and 2 use the dedicated kernel; depth 3 keeps the generic
      // one (with -DDEPTH3 set above).
      if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
    }
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    // Data type macros follow the runtime dtype of the input tensor.
    auto dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape changed; the setArg order
  // below must match the OpenCL kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Tuning key identifies this launch configuration for the auto-tuner cache.
  std::string tuning_key = Concat("depth_to_space",
                                  batch, output_height,
                                  output_width, output_depth);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
public: public:
explicit DepthToSpaceKernel(const int block_size) explicit DepthToSpaceKernel(const int block_size)
...@@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { ...@@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size ",
input_depth);
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
"output channel not support:") << output_depth;
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
uint32_t gws[3];
if (output_depth < 3) {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
} else {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "depth_to_space";
if (output_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", output_depth));
if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(MakeString("-DSTRIDE=", stride)); built_options.emplace(MakeString("-DSTRIDE=", stride));
switch (activation) { switch (activation) {
...@@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} }
} // namespace depthwise } // namespace depthwise
// Prepares shapes/paddings for an OpenCL depthwise convolution and delegates
// the actual kernel build and launch to depthwise::DepthwiseConv2d.
// Resizes `output` to the computed NHWC shape before dispatch.
MaceStatus DepthwiseConv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  const index_t filter_h = filter->dim(2);
  const index_t filter_w = filter->dim(3);
  // Asymmetric strides are not supported by this GPU kernel.
  if (strides[0] != strides[1]) {
    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
                 << "filter" << filter_h << "x" << filter_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }
  // Build an equivalent conv_2d filter shape (dim(0)*dim(1) output channels)
  // so the shared padding / output-size helpers can be reused as-is.
  const std::vector<index_t> fake_filter_shape{
      filter->dim(0) * filter->dim(1),
      filter->dim(1),
      filter->dim(2),
      filter->dim(3)};
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (!padding_data.empty()) {
    // Explicit paddings given: only the output size needs computing.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  } else {
    // Derive both paddings and output size from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  }
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  return depthwise::DepthwiseConv2d(
      context, &kernel_, input, filter, bias, strides[0], paddings.data(),
      dilations, activation, relux_max_limit, leakyrelu_coefficient,
      &input_shape_, output, &kwg_size_);
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
} // namespace depthwise } // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { ...@@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, leakyrelu_coefficient,
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the OpenCL depthwise transposed convolution (deconv) kernel on image
// memory. Only true depthwise grouping is supported (group == channels,
// multiplier == 1). Resizes `output` to `output_shape`, lazily builds and
// caches the cl::Kernel, and re-binds arguments when the input shape changes.
MaceStatus DepthwiseDeconv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const int *padding_data,
    const int group,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const std::vector<index_t> &output_shape,
    Tensor *output) {
  const index_t batch = output_shape[0];
  const index_t height = output_shape[1];
  const index_t width = output_shape[2];
  const index_t channels = output_shape[3];
  const index_t input_channels = input->dim(3);
  const index_t multiplier = filter->dim(0);
  MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
             "opencl image deconv only supports depthwise type group.");
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int stride_h = strides[0];
  const int stride_w = strides[1];
  MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
  // Each width work-item covers up to width_tile output columns; the block
  // count is rounded up per stride phase, hence the trailing * stride_w.
  const int width_tile = 5;
  const index_t n_strides = (width + stride_w - 1) / stride_w;
  const index_t width_blocks =
      ((n_strides + width_tile - 1) / width_tile) * stride_w;
  // Reciprocals precomputed so the kernel can replace division with multiply.
  const float stride_h_r = 1.f / static_cast<float>(stride_h);
  const float stride_w_r = 1.f / static_cast<float>(stride_w);
  // Per-side padding: half of padding_data rounded up — assumes padding_data
  // holds the total padding per dimension (TODO confirm with callers).
  const int padding_h = (padding_data[0] + 1) >> 1;
  const int padding_w = (padding_data[1] + 1) >> 1;
  const int align_h = stride_h - 1 - padding_h;
  const int align_w = stride_w - 1 - padding_w;
  const int kernel_size = filter->dim(2) * filter->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the program on first use only; kernel_ caches the result.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
    built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
    // Kernel is compiled for float data only.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    // Fuse the requested activation into the kernel via a compile-time flag.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3D work grid: channel blocks x width blocks x (rows * batch).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape changed; the setArg order
  // below must match the OpenCL kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
    kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
    kernel_.setArg(idx++, stride_h_r);
    kernel_.setArg(idx++, stride_w_r);
    kernel_.setArg(idx++, static_cast<int32_t>(align_h));
    kernel_.setArg(idx++, static_cast<int32_t>(align_w));
    kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
    kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
    kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration for the auto-tuner cache.
  std::string tuning_key =
      Concat("depthwise_deconv2d_kernel_",
             activation,
             output->dim(0),
             output->dim(1),
             output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { ...@@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
const index_t batch = output_shape[0];
const index_t height = output_shape[1];
const index_t width = output_shape[2];
const index_t channels = output_shape[3];
const index_t input_channels = input->dim(3);
const index_t multiplier = filter->dim(0);
MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
"opencl image deconv only supports depthwise type group.");
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t channel_blocks = RoundUpDiv4(channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("depthwise_deconv2d_kernel_",
activation,
output->dim(0),
output->dim(1),
output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
}  // namespace image
}  // namespace opencl
}  // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/eltwise.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the element-wise op on GPU via an OpenCL image kernel.
//
// input1 may be nullptr, in which case the stored scalar operand
// (scalar_input_) is used instead. Tensor operands must either have the
// same 4-D shape or satisfy one of the supported broadcast layouts
// (vector [C], batch-vector [N,1,1,C], or channel-broadcast [N,H,W,1]);
// anything else aborts via LOG(FATAL). The built kernel and its args are
// cached; args are re-set only when input0's shape changes.
MaceStatus EltwiseKernel::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  // True when operands were reordered; the kernel uses this to apply
  // non-commutative ops (e.g. SUB/DIV) in the caller's original order.
  bool swapped = false;
  // Compile-time define selecting the second operand's layout in the kernel.
  std::string input1_type = "";
  if (input1 == nullptr) {
    input1_type = "INPUT_SCALAR";
  } else {
    MACE_CHECK((input0->dim_size() == input1->dim_size()
                    && input0->dim_size() == 4) ||
                   input0->dim_size() == 1 || input1->dim_size() == 1)
        << "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
    MACE_CHECK(type_ != EltwiseType::EQUAL)
        << "Eltwise op on GPU does not support EQUAL";
    // broadcast
    if (input0->size() != input1->size() ||
        input0->dim_size() != input1->dim_size()) {
      // Keep the larger operand in input0 so the kernel always broadcasts
      // input1 onto input0.
      if (input0->size() < input1->size()
          || input0->dim_size() < input1->dim_size()) {
        std::swap(input0, input1);
        swapped = true;
      }
      if (input1->dim_size() == 1
          || (input1->dim(0) == 1 && input1->dim(1) == 1
              && input1->dim(2) == 1)) {
        // Tensor-Vector element wise: input1 is effectively a length-C vector
        // that must match input0's channel dimension.
        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
          input1_type = "INPUT_VECTOR";
        } else {
          LOG(FATAL) << "Inputs not match the broadcast logic, "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      } else {  // must be 4-D
        if (input0->dim(0) == input1->dim(0)
            && input1->dim(1) == 1
            && input1->dim(2) == 1
            && input0->dim(3) == input1->dim(3)) {
          // Per-batch vector: [N,1,1,C] broadcast over H and W.
          input1_type = "INPUT_BATCH_VECTOR";
        } else if (input0->dim(0) == input1->dim(0)
            && input0->dim(1) == input1->dim(1)
            && input0->dim(2) == input1->dim(2)
            && input1->dim(3) == 1) {
          // broadcast on channel dimension
          input1_type = "INPUT_TENSOR_BC_CHAN";
        } else {
          LOG(FATAL) << "Element-Wise op only support broadcast on"
                        " channel dimension:"
                        "Tensor-BatchVector(4D-[N,1,1,C]) "
                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      }
    }
  }
  // If the scalar was logically the first operand, flip the swap flag so the
  // kernel still evaluates operands in the caller's order.
  if (scalar_input_index_ == 0) {
    swapped = !swapped;
  }
  // Output takes the (possibly swapped) larger operand's shape.
  std::vector<index_t> output_shape(4);
  output_shape[0] = input0->dim(0);
  output_shape[1] = input0->dim(1);
  output_shape[2] = input0->dim(2);
  output_shape[3] = input0->dim(3);
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t batch_height_pixels = batch * height;
  // Global work size: one work-item per (channel-block, width, batch*height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(batch_height_pixels)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
    built_options.emplace("-Deltwise=" + kernel_name);
    // Compute in float regardless of storage type (post-refactor behavior).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!input1_type.empty()) {
      built_options.emplace("-D" + input1_type);
    }
    if (swapped) built_options.emplace("-DSWAPPED");
    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; the arg
  // order below must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    if (input1 == nullptr) {
      kernel_.setArg(idx++, scalar_input_);
    } else {
      kernel_.setArg(idx++, *(input1->opencl_image()));
    }
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    if (!coeff_.empty()) {
      // Weighted-sum coefficients (only when COEFF_SUM was compiled in).
      kernel_.setArg(idx++, coeff_[0]);
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/eltwise.h" #include "mace/ops/common/eltwise_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -32,7 +32,6 @@ namespace ops { ...@@ -32,7 +32,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class EltwiseKernel : public OpenCLEltwiseKernel { class EltwiseKernel : public OpenCLEltwiseKernel {
public: public:
explicit EltwiseKernel( explicit EltwiseKernel(
...@@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel { ...@@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of EltwiseKernel::Compute (removed in this
// commit in favor of the non-template float version above). Behavior is
// identical except the OpenCL data type is derived from the template
// parameter T instead of being fixed to float.
template <typename T>
MaceStatus EltwiseKernel<T>::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  // True when operands were reordered; lets the kernel apply
  // non-commutative ops in the caller's original order.
  bool swapped = false;
  // Compile-time define selecting the second operand's layout in the kernel.
  std::string input1_type = "";
  if (input1 == nullptr) {
    input1_type = "INPUT_SCALAR";
  } else {
    MACE_CHECK((input0->dim_size() == input1->dim_size()
                    && input0->dim_size() == 4) ||
                   input0->dim_size() == 1 || input1->dim_size() == 1)
        << "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
    MACE_CHECK(type_ != EltwiseType::EQUAL)
        << "Eltwise op on GPU does not support EQUAL";
    // broadcast
    if (input0->size() != input1->size() ||
        input0->dim_size() != input1->dim_size()) {
      // Keep the larger operand in input0 so input1 is the one broadcast.
      if (input0->size() < input1->size()
          || input0->dim_size() < input1->dim_size()) {
        std::swap(input0, input1);
        swapped = true;
      }
      if (input1->dim_size() == 1
          || (input1->dim(0) == 1 && input1->dim(1) == 1
              && input1->dim(2) == 1)) {
        // Tensor-Vector element wise: input1 must match input0's channels.
        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
          input1_type = "INPUT_VECTOR";
        } else {
          LOG(FATAL) << "Inputs not match the broadcast logic, "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      } else {  // must be 4-D
        if (input0->dim(0) == input1->dim(0)
            && input1->dim(1) == 1
            && input1->dim(2) == 1
            && input0->dim(3) == input1->dim(3)) {
          // Per-batch vector: [N,1,1,C] broadcast over H and W.
          input1_type = "INPUT_BATCH_VECTOR";
        } else if (input0->dim(0) == input1->dim(0)
            && input0->dim(1) == input1->dim(1)
            && input0->dim(2) == input1->dim(2)
            && input1->dim(3) == 1) {
          // broadcast on channel dimension
          input1_type = "INPUT_TENSOR_BC_CHAN";
        } else {
          LOG(FATAL) << "Element-Wise op only support broadcast on"
                        " channel dimension:"
                        "Tensor-BatchVector(4D-[N,1,1,C]) "
                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      }
    }
  }
  // If the scalar was logically the first operand, flip the swap flag.
  if (scalar_input_index_ == 0) {
    swapped = !swapped;
  }
  // Output takes the (possibly swapped) larger operand's shape.
  std::vector<index_t> output_shape(4);
  output_shape[0] = input0->dim(0);
  output_shape[1] = input0->dim(1);
  output_shape[2] = input0->dim(2);
  output_shape[3] = input0->dim(3);
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t batch_height_pixels = batch * height;
  // Global work size: (channel-blocks, width, batch*height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(batch_height_pixels)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    // Data type follows the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
    built_options.emplace("-Deltwise=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!input1_type.empty()) {
      built_options.emplace("-D" + input1_type);
    }
    if (swapped) built_options.emplace("-DSWAPPED");
    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    if (input1 == nullptr) {
      kernel_.setArg(idx++, scalar_input_);
    } else {
      kernel_.setArg(idx++, *(input1->opencl_image()));
    }
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    if (!coeff_.empty()) {
      // Weighted-sum coefficients (only when COEFF_SUM was compiled in).
      kernel_.setArg(idx++, coeff_[0]);
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/fully_connected.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs a fully-connected (inner product) layer on GPU with optional bias
// and fused activation. Output shape is [N, 1, 1, out_channels] where
// out_channels = weight->dim(0). The kernel and launch geometry (gws_/lws_)
// are computed once on first use; gws_[2] is refreshed when the input
// shape changes. Launches the kernel directly (no autotuning).
MaceStatus FullyConnectedKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build the kernel and pick the work-group geometry once.
  if (kernel_.get() == nullptr) {
    const index_t batch = output->dim(0);
    const index_t output_size = output->dim(3);
    // Output channels are processed in blocks of 4 (vectorized).
    const index_t output_blocks = RoundUpDiv4(output_size);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
    built_options.emplace("-Dfully_connected_width=" + kernel_name);
    // Compute in float regardless of storage type (post-refactor behavior).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
    // Fuse the activation into the kernel via a compile-time define.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
                                              built_options, &kernel_));
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
      // NOTE(review): this emplace happens after BuildKernel, so the define
      // never reaches the compiled kernel — looks like dead code; confirm.
      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
      // On Adreno, size the Y dimension from the hardware wave size.
      const uint32_t wave_size =
          static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
      gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    } else {
      gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    }
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));
    gws_[2] = static_cast<uint32_t>(batch * output_blocks);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws_);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    // Local-memory scratch buffer for the intra-work-group reduction
    // (size arg with nullptr value => __local allocation).
    kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
    kernel_.setArg(idx++, static_cast<int>(output_blocks));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws_.size());
    for (size_t i = 0; i < lws_.size(); ++i) {
      roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  }
  MACE_OUT_OF_RANGE_VALIDATION;
  MACE_CL_RET_STATUS(error);
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -30,7 +31,6 @@ namespace ops { ...@@ -30,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class FullyConnectedKernel : public OpenCLFullyConnectedKernel { class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { ...@@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of FullyConnectedKernel::Compute (removed
// in this commit in favor of the non-template float version above). Behavior
// is identical except the OpenCL data type follows the template parameter T.
template <typename T>
MaceStatus FullyConnectedKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  // Output is [N, 1, 1, out_channels] with out_channels = weight->dim(0).
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build the kernel and pick the work-group geometry once.
  if (kernel_.get() == nullptr) {
    const index_t batch = output->dim(0);
    const index_t output_size = output->dim(3);
    // Output channels are processed in blocks of 4 (vectorized).
    const index_t output_blocks = RoundUpDiv4(output_size);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    // Data type follows the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
    built_options.emplace("-Dfully_connected_width=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
    // Fuse the activation into the kernel via a compile-time define.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
                                              built_options, &kernel_));
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
      // NOTE(review): this emplace happens after BuildKernel, so the define
      // never reaches the compiled kernel — looks like dead code; confirm.
      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
      // On Adreno, size the Y dimension from the hardware wave size.
      const uint32_t wave_size =
          static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
      gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    } else {
      gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    }
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));
    gws_[2] = static_cast<uint32_t>(batch * output_blocks);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws_);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    // Local-memory scratch buffer for the intra-work-group reduction
    // (size arg with nullptr value => __local allocation).
    kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
    kernel_.setArg(idx++, static_cast<int>(output_blocks));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws_.size());
    for (size_t i = 0; i < lws_.size(); ++i) {
      roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  }
  MACE_OUT_OF_RANGE_VALIDATION;
  MACE_CL_RET_STATUS(error);
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Copies a tensor stored as an OpenCL image back into an OpenCL buffer,
// choosing the kernel variant by buffer type (filter, in/out, argument,
// Winograd, ...). wino_blk_size is only used for WINOGRAD_FILTER images.
// The built kernel is cached; args are re-set when the input shape changes.
MaceStatus ImageToBuffer::Compute(OpContext *context,
                                  const Tensor *input,
                                  const OpenCLBufferType type,
                                  const int wino_blk_size,
                                  Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
  // 2-D global work size: one work-item per image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Select the kernel variant matching the buffer layout.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_image_to_buffer";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_image_to_buffer";
      break;
    case ARGUMENT:kernel_name = "arg_image_to_buffer";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_image_to_buffer";
      break;
    case WINOGRAD_FILTER: {
      // Winograd filter images pack (blk+2)^2 tiles along the height axis.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_image_to_buffer_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
    case WEIGHT_HEIGHT:kernel_name = "weight_height_image_to_buffer";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_image_to_buffer";
      break;
    case DW_CONV2D_FILTER:
    case IN_OUT_WIDTH:LOG(FATAL)
          << "IN_OUT_WIDTH only support buffer to image now";
      break;
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (output->dtype() == input->dtype()) {
      // Same dtype on both sides: copy in the tensors' native type.
      auto data_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
    } else {
      // Mixed dtypes: go through float for the conversion.
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; each layout
  // variant takes a different set of dimension args.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = output->dim(1) * output->dim(2) * output->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
    } else if (type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    input_shape_ = input->shape();
  }
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  // Fixed 16-wide local size; second dim fills the rest of the work-group.
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -28,7 +28,6 @@ namespace ops { ...@@ -28,7 +28,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ImageToBuffer : public OpenCLBufferTransformKernel { class ImageToBuffer : public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute(OpContext *context, MaceStatus Compute(OpContext *context,
...@@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { ...@@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of ImageToBuffer::Compute (removed in this
// commit in favor of the non-template version above). Behavior is identical
// except the OpenCL data type is derived from the template parameter T
// instead of the tensors' runtime dtypes.
template <typename T>
MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
                                     const Tensor *input,
                                     const OpenCLBufferType type,
                                     const int wino_blk_size,
                                     Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
  // 2-D global work size: one work-item per image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Select the kernel variant matching the buffer layout.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:
      kernel_name = "filter_image_to_buffer";
      break;
    case IN_OUT_CHANNEL:
      kernel_name = "in_out_image_to_buffer";
      break;
    case ARGUMENT:
      kernel_name = "arg_image_to_buffer";
      break;
    case IN_OUT_HEIGHT:
      kernel_name = "in_out_height_image_to_buffer";
      break;
    case WINOGRAD_FILTER: {
      // Winograd filter images pack (blk+2)^2 tiles along the height axis.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_image_to_buffer_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
    case WEIGHT_HEIGHT:
      kernel_name = "weight_height_image_to_buffer";
      break;
    case WEIGHT_WIDTH:
      kernel_name = "weight_width_image_to_buffer";
      break;
    case DW_CONV2D_FILTER:
    case IN_OUT_WIDTH:
      LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now";
      break;
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (output->dtype() == input->dtype()) {
      // Same dtype on both sides: copy in T's native OpenCL type.
      built_options.emplace(
          "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
                            DtToCLCMDDt(DataTypeToEnum<T>::value));
    } else {
      // Mixed dtypes: use the up-compatible (wider) OpenCL type for T.
      built_options.emplace("-DDATA_TYPE=" +
                            DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
                            DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; each layout
  // variant takes a different set of dimension args.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = output->dim(1) * output->dim(2) * output->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
    } else if (type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    input_shape_ = input->shape();
  }
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  // Fixed 16-wide local size; second dim fills the rest of the work-group.
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/lstm_cell.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs one LSTM cell step on the GPU using the "lstmcell" OpenCL image
// kernel: reads `input`, the previous hidden state `pre_output`, `weight`,
// `bias` and the previous cell state `pre_cell`, and writes the updated
// cell state to `cell` and the new hidden state to `output`.
// Returns MACE_SUCCESS on success, or the error from kernel build/launch.
MaceStatus LSTMCellKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *pre_output,
    const Tensor *weight,
    const Tensor *bias,
    const Tensor *pre_cell,
    Tensor *cell,
    Tensor *output) {
  // The kernel vectorizes over the hidden dimension in groups of 4
  // (see w_blocks below), so hidden_units must be a multiple of 4.
  MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
             "LSTM hidden units should be a multiple of 4");
  const index_t height = input->dim(0);
  const index_t width = input->dim(1);
  const index_t hidden_units = pre_output->dim(1);
  const index_t w_blocks = hidden_units >> 2;  // 4-wide blocks of hidden units
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
    built_options.emplace("-Dlstmcell=" + kernel_name);
    // Device-side computation is done in float regardless of tensor dtype.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: one work-item per (hidden 4-block, batch row).
  const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
                           static_cast<uint32_t>(height)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Outputs are resized and kernel args rebound only when the input shape
  // changes; otherwise the previously bound args are reused.
  if (!IsVecEqual(input_shape_, input->shape())) {
    std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape_padded,
                                OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
                                             output_image_shape));
    MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
                                           output_image_shape));
    uint32_t idx = 0;
    // NOTE: argument order must match the lstmcell kernel signature exactly;
    // the out-of-range/GWS macros consume leading arg slots via idx.
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(pre_output->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(pre_cell->opencl_image()));
    kernel_.setArg(idx++, forget_bias_);
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
    kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
    kernel_.setArg(idx++, *(cell->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,11 +30,10 @@ namespace ops { ...@@ -30,11 +30,10 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class LSTMCellKernel : public OpenCLLSTMCellKernel { class LSTMCellKernel : public OpenCLLSTMCellKernel {
public: public:
explicit LSTMCellKernel( explicit LSTMCellKernel(
const T forget_bias) const float forget_bias)
: forget_bias_(forget_bias) {} : forget_bias_(forget_bias) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { ...@@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel {
Tensor *output) override; Tensor *output) override;
private: private:
T forget_bias_; float forget_bias_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus LSTMCellKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
const index_t w_blocks = hidden_units >> 2;
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape_padded,
OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, static_cast<float>(forget_bias_));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/matmul.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Computes C = A * B on the GPU with the "matmul" OpenCL image kernel.
// A and B share the same rank; the last two dims of A are (height, K) and
// the last dim of B is width. All leading dims are folded into one batch.
// Transposed inputs are rejected. Returns MACE_SUCCESS or a build/launch
// error.
MaceStatus MatMulKernel::Compute(
    OpContext *context,
    const Tensor *A,
    const Tensor *B,
    Tensor *C,
    bool transpose_a,
    bool transpose_b) {
  MACE_CHECK(!transpose_a && !transpose_b,
             "GPU does not support transpose matmul");
  index_t rank = A->dim_size();
  index_t height = A->dim(rank - 2);
  index_t K = A->dim(rank - 1);
  index_t width = B->dim(rank - 1);
  // Fold every dimension before the trailing (height, K) pair into batch.
  index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
                                  std::multiplies<index_t>());
  // C keeps A's leading dims, with the last two replaced by (height, width).
  std::vector<index_t> c_shape = A->shape();
  c_shape[rank - 2] = height;
  c_shape[rank - 1] = width;
  std::vector<size_t> c_image_shape;
  std::vector<index_t> padded_c_shape = {batch, height, width, 1};
  OpenCLUtil::CalImage2DShape(padded_c_shape,
                              OpenCLBufferType::IN_OUT_HEIGHT,
                              &c_image_shape);
  MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
  // The kernel tiles the output in 4x4 blocks.
  const index_t height_blocks = RoundUpDiv4(height);
  const index_t width_blocks = RoundUpDiv4(width);
  const uint32_t gws[2] = {
      static_cast<uint32_t>(width_blocks),
      static_cast<uint32_t>(height_blocks * batch),
  };
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
    built_options.emplace("-Dmatmul=" + kernel_name);
    // Device-side computation is done in float regardless of tensor dtype.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Args are rebound on every call since the output tensor (and shapes)
  // may differ between runs. Order must match the matmul kernel signature.
  uint32_t idx = 0;
  MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
  MACE_SET_2D_GWS_ARGS(kernel_, gws);
  kernel_.setArg(idx++, *(A->opencl_image()));
  kernel_.setArg(idx++, *(B->opencl_image()));
  kernel_.setArg(idx++, *(C->opencl_image()));
  kernel_.setArg(idx++, static_cast<int>(height));
  kernel_.setArg(idx++, static_cast<int>(width));
  kernel_.setArg(idx++, static_cast<int>(K));
  kernel_.setArg(idx++, static_cast<int>(height_blocks));
  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
  std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class MatMulKernel : public OpenCLMatMulKernel { class MatMulKernel : public OpenCLMatMulKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel { ...@@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel {
uint32_t kwg_size_; uint32_t kwg_size_;
}; };
template <typename T>
MaceStatus MatMulKernel<T>::Compute(
OpContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b) {
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
OpenCLUtil::CalImage2DShape(padded_c_shape,
OpenCLBufferType::IN_OUT_HEIGHT,
&c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pad.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Pads an NHWC image tensor on the GPU with the "pad" OpenCL kernel.
// paddings_ holds (before, after) pairs per dimension, i.e. indices
// [0,1]=batch, [2,3]=height, [4,5]=width, [6,7]=channel; only height and
// width padding are supported. Supports CONSTANT, REFLECT and SYMMETRIC
// pad types. Returns MACE_SUCCESS or a build/launch error.
MaceStatus PadKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  // Two padding values (before/after) per input dimension.
  MACE_CHECK(this->paddings_.size() ==
      static_cast<size_t>((input->dim_size() * 2)));
  MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
      (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
    << "Mace only support height/width dimension now";
  for (int i = 2; i <= 5; ++i) {
    MACE_CHECK(paddings_[i] >= 0);
  }
  auto input_shape = input->shape();
  if (type_ == PadType::REFLECT) {
    // REFLECT mirrors without repeating the edge, so each pad amount must
    // be strictly smaller than the corresponding input extent.
    MACE_CHECK(paddings_[2] < input_shape[1] &&
        paddings_[3] < input_shape[1] &&
        paddings_[4] < input_shape[2] &&
        paddings_[5] < input_shape[2]);
  } else if (type_ == PadType::SYMMETRIC) {
    // SYMMETRIC repeats the edge, so pad amounts up to the extent are fine.
    MACE_CHECK(paddings_[2] <= input_shape[1] &&
        paddings_[3] <= input_shape[1] &&
        paddings_[4] <= input_shape[2] &&
        paddings_[5] <= input_shape[2]);
  } else {
    MACE_CHECK(type_ == PadType::CONSTANT);
  }
  // Output shape = input shape grown by the before/after pad of each dim.
  std::vector<index_t> output_shape = {
      input_shape[0] + this->paddings_[0] + this->paddings_[1],
      input_shape[1] + this->paddings_[2] + this->paddings_[3],
      input_shape[2] + this->paddings_[4] + this->paddings_[5],
      input_shape[3] + this->paddings_[6] + this->paddings_[7]};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
    built_options.emplace("-Dpad=" + kernel_name);
    // Kernel data type follows the input tensor's dtype.
    auto dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    // Pad type is compiled into the kernel, so a type change would need a
    // rebuild (kernel_ is keyed to the type chosen at first call).
    built_options.emplace(MakeString("-DPAD_TYPE=", type_));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3-D global work size: (channel 4-blocks, width, height * batch).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel args are rebound only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    // The constant fill value is an extra argument only for CONSTANT pads
    // (the kernel signature differs per compiled PAD_TYPE).
    if (type_ == PadType::CONSTANT) {
      kernel_.setArg(idx++, this->constant_value_);
    }
    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
    kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
    kernel_.setArg(idx++, this->paddings_[2]);
    kernel_.setArg(idx++, this->paddings_[4]);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                  output->dim(2), output->dim(3));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/pad.h" #include "mace/ops/common/pad_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class PadKernel : public OpenCLPadKernel { class PadKernel : public OpenCLPadKernel {
public: public:
PadKernel(const PadType type, PadKernel(const PadType type,
...@@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel { ...@@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PadKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
for (int i = 2; i <= 5; ++i) {
MACE_CHECK(paddings_[i] >= 0);
}
auto input_shape = input->shape();
if (type_ == PadType::REFLECT) {
MACE_CHECK(paddings_[2] < input_shape[1] &&
paddings_[3] < input_shape[1] &&
paddings_[4] < input_shape[2] &&
paddings_[5] < input_shape[2]);
} else if (type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[2] <= input_shape[1] &&
paddings_[3] <= input_shape[1] &&
paddings_[4] <= input_shape[2] &&
paddings_[5] <= input_shape[2]);
} else {
MACE_CHECK(type_ == PadType::CONSTANT);
}
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(MakeString("-DPAD_TYPE=", type_));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
if (type_ == PadType::CONSTANT) {
kernel_.setArg(idx++, this->constant_value_);
}
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs 2-D max/average pooling on an NHWC image tensor via the "pooling"
// OpenCL kernel. Output spatial size and implicit padding are derived from
// `padding_type` when `padding_data` is empty, otherwise from the explicit
// `padding_data`. Dilation is not supported. Returns MACE_SUCCESS or a
// build/launch error.
MaceStatus PoolingKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,           // {kernel_h, kernel_w}
    const int *strides,           // {stride_h, stride_w}
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,         // must be {1, 1}
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
    << "Pooling opencl kernel not support dilation yet";
  std::vector<index_t> output_shape(4);
  // Pooling keeps the channel count, so a pseudo filter shape of
  // {channels, channels, kh, kw} is used for the shared size calculators.
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // Derive output size and total padding from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    // Explicit padding provided by the model.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    if (pooling_type == MAX && input->dtype() == output->dtype()) {
      // Max pooling with matching dtypes can run in the tensors' own type.
      auto data_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
    } else {
      // Otherwise fall back to float (presumably to keep accumulation
      // precision for AVG pooling — confirm against the .cl kernel).
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3-D global work size: (channel 4-blocks, out width, out height * batch).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel args are rebound only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
    // paddings holds the total pad; the kernel takes the leading (top/left)
    // half.
    kernel_.setArg(idx++, paddings[0] / 2);
    kernel_.setArg(idx++, paddings[1] / 2);
    kernel_.setArg(idx++, strides[0]);
    kernel_.setArg(idx++, strides[1]);
    kernel_.setArg(idx++, kernels[0]);
    kernel_.setArg(idx++, kernels[1]);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -57,7 +57,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -57,7 +57,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace pooling } // namespace pooling
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel { class PoolingKernel : public OpenCLPoolingKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel { ...@@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/reduce.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Reduces the spatial (H, W) dimensions of an NHWC image tensor on the GPU,
// producing a [batch, 1, 1, channels] output image. The reduction operator
// is baked into the kernel at build time via the -DREDUCE_TYPE compile
// option taken from reduce_type_.
MaceStatus ReduceKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Output keeps batch and channels but collapses H and W to 1.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
    built_options.emplace("-Dreduce=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Choose the work-group shape: on Adreno, size it from the kernel's wave
  // size; otherwise use a fixed 4-wide group with up to 16 rows so each
  // work-item still has several input elements to accumulate.
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    // Ensure each kernel has at least 4 input elements.
    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
    if (gws[1] == 0) {
      gws[1] = 1;
    } else if (gws[1] > 16) {
      gws[1] = 16;
    }
  }
  lws = {gws[0], gws[1], 1};
  // group_num work-items cooperate on one (batch, channel-block) slice.
  const int group_num = lws[0] * lws[1] * lws[2];
  // Each kernel intends to compute compute_size elements.
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;
  // 1 / (H * W) — presumably the normalizer for the mean reduce type; it is
  // passed unconditionally and the kernel decides whether to use it.
  const float scale = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments only need to be re-set when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Local scratch buffer: four floats per work-item in the group.
    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, scale);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, the global size must be a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so callers can block on, and profile, the
  // enqueued kernel.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -24,20 +24,18 @@ ...@@ -24,20 +24,18 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/reduce.h" #include "mace/ops/common/reduce_type.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ReduceKernel : public OpenCLReduceKernel { class ReduceKernel : public OpenCLReduceKernel {
public: public:
ReduceKernel(ReduceType type, ReduceKernel(ReduceType type,
const std::vector<int> &axis, const std::vector<int> &axis)
const bool keep_dims) : reduce_type_(type), axis_(axis) {}
: reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel { ...@@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel {
private: private:
ReduceType reduce_type_; ReduceType reduce_type_;
const std::vector<int> axis_; const std::vector<int> axis_;
bool keep_dims_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU spatial reduce: identical in
// structure to the non-templated version, but the OpenCL DATA_TYPE /
// CMD_DATA_TYPE build options are derived from the template parameter T
// (e.g. float vs half) instead of being fixed to float.
template <typename T>
MaceStatus ReduceKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Output keeps batch and channels but collapses H and W to 1.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    // Data type is chosen by the template parameter T.
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
    built_options.emplace("-Dreduce=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Work-group shape: wave-size based on Adreno, fixed 4x(<=16) elsewhere.
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    // Ensure each kernel has at least 4 input elements.
    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
    if (gws[1] == 0) {
      gws[1] = 1;
    } else if (gws[1] > 16) {
      gws[1] = 16;
    }
  }
  lws = {gws[0], gws[1], 1};
  // group_num work-items cooperate on one (batch, channel-block) slice.
  const int group_num = lws[0] * lws[1] * lws[2];
  // Each kernel intends to compute compute_size elements.
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;
  // 1 / (H * W) — presumably the mean-reduce normalizer; passed always.
  const float scale = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments are only re-set when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Local scratch buffer: four floats per work-item in the group.
    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, scale);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, round the global size up to a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Async wait hook for callers that need to block on / profile the kernel.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bicubic.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor to (out_height_, out_width_) on the GPU using
// bicubic interpolation. The interpolation coefficient table size is baked in
// at kernel-build time via -DTABLE_SIZE.
MaceStatus ResizeBicubicKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(
        MakeString("-DTABLE_SIZE=", common::utils::kTableSize));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bicubic",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args only need refreshing when the input
  // shape changes (the target size is a construction-time constant).
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,13 +25,14 @@ ...@@ -25,13 +25,14 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bicubic.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
namespace resize_bicubic { namespace resize_bicubic {
constexpr int64_t kTableSize = (1u << 10);
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws, const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
...@@ -60,7 +61,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -60,7 +61,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bicubic } // namespace resize_bicubic
template <typename T>
class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
public: public:
ResizeBicubicKernel(bool align_corners, ResizeBicubicKernel(bool align_corners,
...@@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { ...@@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU bicubic resize: identical in
// structure to the non-templated version, but DATA_TYPE / CMD_DATA_TYPE are
// derived from the template parameter T, and the coefficient table size comes
// from mace::ops::resize_bicubic::kTableSize.
template <typename T>
MaceStatus ResizeBicubicKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(
        MakeString("-DTABLE_SIZE=",
                   mace::ops::resize_bicubic::kTableSize));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bicubic",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args refresh only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_bicubic::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        mace::ops::resize_bicubic::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bilinear.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor to (out_height_, out_width_) on the GPU using
// bilinear interpolation.
MaceStatus ResizeBilinearKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bilinear",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args only need refreshing when the input
  // shape changes (the target size is a construction-time constant).
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(in_height,
                                            out_height,
                                            align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(in_width,
                                            out_width,
                                            align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bilinear.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bilinear } // namespace resize_bilinear
template <typename T>
class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
public: public:
ResizeBilinearKernel(bool align_corners, ResizeBilinearKernel(bool align_corners,
...@@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { ...@@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU bilinear resize: identical in
// structure to the non-templated version, but DATA_TYPE / CMD_DATA_TYPE are
// derived from the template parameter T instead of being fixed to float.
template <typename T>
MaceStatus ResizeBilinearKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bilinear",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args refresh only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_bilinear::CalculateResizeScale(in_height,
                                                         out_height,
                                                         align_corners_);
    float width_scale =
        mace::ops::resize_bilinear::CalculateResizeScale(in_width,
                                                         out_width,
                                                         align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor on the GPU using nearest-neighbor sampling.
// Unlike the bilinear/bicubic kernels, the target size is a runtime input:
// it is read from the first two int32 elements of the `size` tensor.
MaceStatus ResizeNearestNeighborKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *size,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Map the size tensor into host memory for the duration of this call.
  Tensor::MappingGuard size_mapper(size);
  const index_t out_height = size->data<int32_t>()[0];
  const index_t out_width = size->data<int32_t>()[1];
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
        "resize_nearest_neighbor_nocache");
    built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_nearest_neighbor",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // NOTE(review): output resize and kernel args are refreshed only when the
  // *input* shape changes; if only the `size` tensor's values change between
  // calls, the stale output shape/scales would be reused — confirm callers
  // never vary `size` while keeping the input shape fixed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_nearest_neighbor.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_nearest_neighbor } // namespace resize_nearest_neighbor
template <typename T>
class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
public: public:
explicit ResizeNearestNeighborKernel(bool align_corners) explicit ResizeNearestNeighborKernel(bool align_corners)
...@@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { ...@@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU nearest-neighbor resize:
// identical in structure to the non-templated version, but DATA_TYPE /
// CMD_DATA_TYPE are derived from the template parameter T. The target size
// is read at runtime from the first two int32 elements of `size`.
template <typename T>
MaceStatus ResizeNearestNeighborKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *size,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Map the size tensor into host memory for the duration of this call.
  Tensor::MappingGuard size_mapper(size);
  const index_t out_height = size->data<int32_t>()[0];
  const index_t out_width = size->data<int32_t>()[1];
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
        "resize_nearest_neighbor_nocache");
    built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_nearest_neighbor",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // NOTE(review): output resize and kernel args are refreshed only when the
  // *input* shape changes; a change in the `size` values alone would reuse a
  // stale output shape and scales — confirm callers never do that.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU softmax over the channel dimension using an OpenCL image2d kernel.
// Accepts 2-D (batch, classes) or 4-D NHWC logits; any other rank aborts
// via MACE_NOT_IMPLEMENTED. The program is compiled once with a float
// data type and cached in kernel_; -DUSE_LOG selects log-softmax.
MaceStatus SoftmaxKernel::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  // Normalize both supported ranks to an NHWC view.
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Channels are padded to a multiple of 4 in the image layout; the kernel
  // is told how many padded lanes to exclude from the reduction.
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (use_log_)
      built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -56,7 +56,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -56,7 +56,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} }
} // namespace softmax } // namespace softmax
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log) explicit SoftmaxKernel(bool use_log)
...@@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { ...@@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU softmax (data type T baked into the OpenCL program via
// -DDATA_TYPE). Accepts 2-D (batch, classes) or 4-D NHWC logits; any other
// rank aborts via MACE_NOT_IMPLEMENTED. -DUSE_LOG selects log-softmax.
MaceStatus SoftmaxKernel<T>::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  // Normalize both supported ranks to an NHWC view.
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Number of padded channel lanes the kernel must exclude.
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (use_log_)
      built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_batch.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU space-to-batch: rearranges spatial blocks of the (optionally padded)
// input into the batch dimension, writing into batch_tensor. The OpenCL
// data type is taken from the input tensor's dtype, so one compiled kernel
// serves whatever precision the graph runs in.
MaceStatus SpaceToBatchKernel::Compute(
    OpContext *context,
    const Tensor *space_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      batch_tensor->ResizeImage(output_shape, output_image_shape));
  const char *kernel_name = "space_to_batch";
  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
  // Global work size over the OUTPUT: (channel block, out w, out n*h).
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    auto input_dt = space_tensor->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2] are the top and left pads; the bottom/right
    // pads are implied by the output shape.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = space_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { ...@@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU space-to-batch (data type T baked into the OpenCL program).
// Rearranges spatial blocks of the (optionally padded) input into the batch
// dimension, writing into batch_tensor.
MaceStatus SpaceToBatchKernel<T>::Compute(
    OpContext *context,
    const Tensor *space_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      batch_tensor->ResizeImage(output_shape, output_image_shape));
  const char *kernel_name = "space_to_batch";
  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
  // Global work size over the OUTPUT: (channel block, out w, out n*h).
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
        DtToCLCMDDt(DataTypeToEnum<T>::value));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2] are the top and left pads; the bottom/right
    // pads are implied by the output shape.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = space_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_depth.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU space-to-depth: moves block_size_ x block_size_ spatial patches into
// the channel dimension (H/bs, W/bs, C*bs*bs). Requires H and W divisible
// by block_size_, and C either < 4 or a multiple of 4 (image packing).
// The OpenCL data type is taken from the input tensor's dtype.
MaceStatus SpaceToDepthKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
             "input channel should be dividable by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be dividable by block_size");
  const index_t output_height = input_height / block_size_;
  const index_t output_width = input_width / block_size_;
  const index_t output_depth = input_depth * block_size_ * block_size_;
  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch, output_height, output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "space_to_depth";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    // For C < 4 the kernel needs a specialized path, selected by -DDEPTHn.
    if (input_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", input_depth));
    }
    built_options.emplace(kernel_name_ss.str());
    auto input_dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size over the OUTPUT: (depth block, out w, out n*h).
  const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
                           static_cast<uint32_t>(output_width),
                           static_cast<uint32_t>(output_height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("space_to_depth", input->dim(0),
                                  input->dim(1), input->dim(2), input->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
public: public:
explicit SpaceToDepthKernel(const int block_size) explicit SpaceToDepthKernel(const int block_size)
...@@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { ...@@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU space-to-depth (data type T baked into the OpenCL program).
// Moves block_size_ x block_size_ spatial patches into the channel
// dimension. Requires H and W divisible by block_size_, and C either < 4
// or a multiple of 4 (image packing).
MaceStatus SpaceToDepthKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
             "input channel should be dividable by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be dividable by block_size");
  const index_t output_height = input_height / block_size_;
  const index_t output_width = input_width / block_size_;
  const index_t output_depth = input_depth * block_size_ * block_size_;
  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch, output_height, output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "space_to_depth";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    // For C < 4 the kernel needs a specialized path, selected by -DDEPTHn.
    if (input_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", input_depth));
    }
    built_options.emplace(kernel_name_ss.str());
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size over the OUTPUT: (depth block, out w, out n*h).
  const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
                           static_cast<uint32_t>(output_width),
                           static_cast<uint32_t>(output_height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("space_to_depth", input->dim(0),
                                  input->dim(1), input->dim(2), input->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/split.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU split: carves the input's channel dimension into outputs_count equal
// slices, launching the same kernel once per output with a different
// channel offset. Only the channel axis (dim 3) is handled here; axis_ is
// deliberately not consulted (MACE_UNUSED below) — presumably callers
// guarantee axis == 3 before reaching this kernel (TODO confirm upstream).
MaceStatus SplitKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const std::vector<Tensor *> &output_list) {
  MACE_UNUSED(axis_);
  const index_t input_channels = input->dim(3);
  const size_t outputs_count = output_list.size();
  // Even split: every output gets input_channels / outputs_count channels.
  const index_t output_channels = input_channels / outputs_count;
  std::vector<index_t> output_shape(
      {input->dim(0), input->dim(1), input->dim(2), output_channels});
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  for (size_t i = 0; i < outputs_count; ++i) {
    MACE_RETURN_IF_ERROR(
        output_list[i]->ResizeImage(output_shape, image_shape));
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use; the data type is taken
  // from the input tensor's dtype.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
    built_options.emplace("-Dsplit=" + kernel_name);
    auto input_dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);
  // Global work size per slice: (channel block, w, n*h).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  // Aggregate profiling stats across all per-output launches:
  // earliest start, summed durations.
  CallStats call_stats{INT64_MAX, 0};
  for (size_t i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Channel-block offset of this slice within the input.
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
    cl_int error;
    if (runtime->IsNonUniformWorkgroupsSupported()) {
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    } else {
      // Devices without non-uniform workgroups need gws rounded up to a
      // multiple of lws; the kernel guards out-of-range work-items.
      std::vector<uint32_t> roundup_gws(lws.size());
      for (size_t j = 0; j < 3; ++j) {
        roundup_gws[j] = RoundUp(gws[j], lws[j]);
      }
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange,
          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    }
    MACE_CL_RET_STATUS(error);
    MACE_OUT_OF_RANGE_VALIDATION;
    if (context->future() != nullptr && runtime->is_profiling_enabled()) {
      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
    }
  }
  // Report the aggregated timing to the caller's future, if requested.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SplitKernel : public OpenCLSplitKernel { class SplitKernel : public OpenCLSplitKernel {
public: public:
explicit SplitKernel(const int32_t axis) : axis_(axis) {} explicit SplitKernel(const int32_t axis) : axis_(axis) {}
...@@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel { ...@@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel {
uint32_t kwg_size_; uint32_t kwg_size_;
}; };
template <typename T>
// Templated GPU split (data type T baked into the OpenCL program). Carves
// the input's channel dimension into outputs_count equal slices, launching
// the same kernel once per output with a different channel offset.
MaceStatus SplitKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const std::vector<Tensor *> &output_list) {
  const index_t input_channels = input->dim(3);
  const size_t outputs_count = output_list.size();
  // Even split: every output gets input_channels / outputs_count channels.
  const index_t output_channels = input_channels / outputs_count;
  std::vector<index_t> output_shape(
      {input->dim(0), input->dim(1), input->dim(2), output_channels});
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  for (size_t i = 0; i < outputs_count; ++i) {
    MACE_RETURN_IF_ERROR(
        output_list[i]->ResizeImage(output_shape, image_shape));
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
    built_options.emplace("-Dsplit=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
        DtToCLCMDDt(DataTypeToEnum<T>::value));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);
  // Global work size per slice: (channel block, w, n*h).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  // Aggregate profiling stats across all per-output launches:
  // earliest start, summed durations.
  CallStats call_stats{INT64_MAX, 0};
  for (size_t i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Channel-block offset of this slice within the input.
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
    cl_int error;
    if (runtime->IsNonUniformWorkgroupsSupported()) {
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    } else {
      // Devices without non-uniform workgroups need gws rounded up to a
      // multiple of lws; the kernel guards out-of-range work-items.
      std::vector<uint32_t> roundup_gws(lws.size());
      for (size_t j = 0; j < 3; ++j) {
        roundup_gws[j] = RoundUp(gws[j], lws[j]);
      }
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange,
          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    }
    MACE_CL_RET_STATUS(error);
    MACE_OUT_OF_RANGE_VALIDATION;
    if (context->future() != nullptr && runtime->is_profiling_enabled()) {
      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
    }
  }
  // Report the aggregated timing to the caller's future, if requested.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/sqrdiff_mean.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Computes, on the GPU (OpenCL image memory), a reduction of input0/input1
// over the spatial dimensions: the output shape is {batch, 1, 1, channels},
// i.e. one value per (batch, channel). The actual per-element math lives in
// the "sqrdiff_mean" OpenCL kernel (sqrdiff_mean.cl, not visible here);
// presumably it computes mean((input0 - input1)^2) over H*W — confirm
// against the kernel source.
//
// Args:
//   context: op context providing the device, scratch memory and future.
//   input0:  4-D NHWC tensor to be reduced.
//   input1:  4-D NHWC tensor; must match input0 in batch (dim 0) and
//            channels (dim 3). H/W are not checked here.
//   output:  resized in place to {batch, 1, 1, channels} as an OpenCL image.
//
// Returns MaceStatus::MACE_SUCCESS on success, or the error status from
// kernel build / resize / enqueue.
MaceStatus SqrDiffMeanKernel::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input0);
  MACE_CHECK_NOTNULL(input1);
  // Only batch and channel dims are required to match between the two inputs.
  MACE_CHECK(input0->dim(0) == input1->dim(0) &&
      input0->dim(3) == input1->dim(3));
  MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
             "SqrDiffMean gpu only support 4-dim input");
  index_t batch = input0->dim(0);
  const index_t in_height = input0->dim(1);
  const index_t in_width = input0->dim(2);
  const index_t channels = input0->dim(3);
  // Channels are packed 4-wide into image pixels, hence RoundUpDiv4.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Number of spatial elements each (batch, channel_block) work-group reduces.
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Spatial dims collapse to 1x1; channel count is preserved.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use; the built program is cached in
  // kernel_ for subsequent calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
    built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
    // Kernel always runs in float; inputs are not templated on data type here.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    // Non-Adreno GPUs take a different code path inside the .cl kernel.
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Work-group geometry: on Adreno, size the second dimension from the
  // hardware wave size; otherwise use a fixed 4x16 group. One group per
  // (batch, channel_block) pair along gws[2].
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
  }
  lws = {gws[0], gws[1], 1};
  const int group_size = lws[0] * lws[1] * lws[2];
  // Each work-item reduces partial_len spatial elements (ceiling division);
  // remain_index tells the kernel which items own one element fewer.
  const int partial_len = (image_size + group_size - 1) / group_size;
  const int remain_index = image_size % group_size;
  // Multiply by the reciprocal in-kernel instead of dividing per element.
  const float img_size_reciprocal = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments only need to be (re)set when the input shape changes;
  // they are cached together with input_shape_ across calls.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, *(input1->opencl_image()));
    // Local (work-group shared) scratch: 4 floats per work-item.
    // setArg(index, size, nullptr) is the cl.hpp form for __local memory.
    kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
    kernel_.setArg(idx++, static_cast<int32_t>(batch));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, img_size_reciprocal);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, the global size must be a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose profiling stats lazily: the caller's wait_fn blocks on the event
  // and, if requested, fills in timing data from the OpenCL runtime.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { ...@@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus SqrDiffMeanKernel<T>::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
MACE_CHECK_NOTNULL(input0);
MACE_CHECK_NOTNULL(input1);
MACE_CHECK(input0->dim(0) == input1->dim(0) &&
input0->dim(3) == input1->dim(3));
MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
"SqrDiffMean gpu only support 4-dim input");
index_t batch = input0->dim(0);
const index_t in_height = input0->dim(1);
const index_t in_width = input0->dim(2);
const index_t channels = input0->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float img_size_reciprocal = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,6 @@ namespace { ...@@ -29,7 +29,6 @@ namespace {
MaceStatus WinogradInputTransform(OpContext *context, MaceStatus WinogradInputTransform(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input_tensor, const Tensor *input_tensor,
const DataType dt,
const int *paddings, const int *paddings,
const index_t round_h, const index_t round_h,
const index_t round_w, const index_t round_w,
...@@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context, ...@@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context,
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, built_options,
...@@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context, ...@@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context,
kernel->setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel->setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
} }
const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0}; const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel", std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(0),
...@@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context, ...@@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input_tensor, const Tensor *input_tensor,
const Tensor *bias, const Tensor *bias,
const DataType dt,
const index_t round_h, const index_t round_h,
const index_t round_w, const index_t round_w,
const int wino_blk_size, const int wino_blk_size,
...@@ -145,33 +142,41 @@ MaceStatus WinogradOutputTransform(OpContext *context, ...@@ -145,33 +142,41 @@ MaceStatus WinogradOutputTransform(OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU: }
case PRELU: {
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name, obfuscated_kernel_name,
...@@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size, const int wino_blk_size,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
...@@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
OpenCLBufferType::IN_OUT_HEIGHT, OpenCLBufferType::IN_OUT_HEIGHT,
&t_input_image_shape); &t_input_image_shape);
ScratchImage transformed_input_image(scratch_manager); ScratchImage transformed_input_image(scratch_manager);
std::unique_ptr<Tensor> transformed_input = make_unique<Tensor>( auto input_dt = input->dtype();
transformed_input_image.Scratch(context->device()->allocator(), auto image = transformed_input_image.Scratch(context->device()->allocator(),
t_input_image_shape, dt), dt); t_input_image_shape, input_dt);
auto transformed_input = make_unique<Tensor>(image, input_dt);
MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape,
t_input_image_shape)); t_input_image_shape));
MACE_RETURN_IF_ERROR(WinogradInputTransform( MACE_RETURN_IF_ERROR(WinogradInputTransform(
context, kernels[0], input, dt, paddings, context, kernels[0], input, paddings,
round_h, round_w, wino_blk_size, round_h, round_w, wino_blk_size,
input_changed, transformed_input.get(), input_changed, transformed_input.get(),
kwg_size[0], &t_input_future)); kwg_size[0], &t_input_future));
...@@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
&mm_output_image_shape); &mm_output_image_shape);
ScratchImage mm_output_image(scratch_manager); ScratchImage mm_output_image(scratch_manager);
auto output_dt = input->dtype();
std::unique_ptr<Tensor> mm_output = make_unique<Tensor>( std::unique_ptr<Tensor> mm_output = make_unique<Tensor>(
mm_output_image.Scratch(context->device()->allocator(), mm_output_image.Scratch(context->device()->allocator(),
mm_output_image_shape, dt), dt); mm_output_image_shape, output_dt), output_dt);
MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape,
mm_output_image_shape)); mm_output_image_shape));
...@@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, kernels[1])); built_options, kernels[1]));
...@@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
// t_output (blk_sqr, out_chan, out_width) -> output(NHWC) // t_output (blk_sqr, out_chan, out_width) -> output(NHWC)
MACE_RETURN_IF_ERROR(WinogradOutputTransform( MACE_RETURN_IF_ERROR(WinogradOutputTransform(
context, kernels[2], mm_output.get(), bias, context, kernels[2], mm_output.get(), bias,
dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, round_h, round_w, wino_blk_size, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, kwg_size[2], leakyrelu_coefficient, input_changed, output, kwg_size[2],
&t_output_future)) &t_output_future))
......
...@@ -25,21 +25,20 @@ ...@@ -25,21 +25,20 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class LSTMCellOp; class LSTMCellOp;
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class LSTMCellOp<DeviceType::GPU, T> : public Operation { class LSTMCellOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit LSTMCellOp(OpConstructContext *context) explicit LSTMCellOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
T forget_bias = static_cast<T>( float forget_bias = Operation::GetOptionalArg<float>("scalar_input",
Operation::GetOptionalArg<float>("scalar_input", 0.0);
0.0));
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias); kernel_ = make_unique<opencl::image::LSTMCellKernel>(forget_bias);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -47,30 +46,26 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation { ...@@ -47,30 +46,26 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
const Tensor *pre_output = context->workspace()->GetTensor( const Tensor *pre_output = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (pre_output->is_weight()) { if (pre_output->is_weight()) {
MACE_CHECK(TransformFilter<T>(context, auto status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 1, OpenCLBufferType::IN_OUT_CHANNEL,
1, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS);
} }
MACE_CHECK(TransformFilter<T>(context, auto status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 2, OpenCLBufferType::IN_OUT_CHANNEL,
2, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS); status = TransformFilter(context, operator_def_.get(),
MACE_CHECK(TransformFilter<T>(context, 3, OpenCLBufferType::ARGUMENT,
operator_def_.get(), mem_type);
3, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
OpenCLBufferType::ARGUMENT, const Tensor *pre_cell =
mem_type) == MaceStatus::MACE_SUCCESS); context->workspace()->GetTensor(operator_def_->input(4));
const Tensor *pre_cell = context->workspace()->GetTensor(
operator_def_->input(4));
if (pre_cell->is_weight()) { if (pre_cell->is_weight()) {
MACE_CHECK(TransformFilter<T>(context, status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 4, OpenCLBufferType::IN_OUT_CHANNEL,
4, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS);
} }
} }
...@@ -92,14 +87,10 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation { ...@@ -92,14 +87,10 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL);
MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT);
}; };
#endif #endif // MACE_ENABLE_OPENCL
void RegisterLSTMCell(OpRegistryBase *op_registry) { void RegisterLSTMCell(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp);
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp,
DeviceType::GPU, half);
} }
} // namespace ops } // namespace ops
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <vector> #include <vector>
#include "mace/ops/pooling.h" #include "mace/ops/common/pooling_type.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
namespace mace { namespace mace {
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/pad.h" #include "mace/ops/common/pad_type.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pad.h" #include "mace/ops/opencl/image/pad.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -26,10 +26,10 @@ ...@@ -26,10 +26,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class PadOp; class PadOp;
template <typename T> template<typename T>
class PadOp<DeviceType::CPU, T> : public Operation { class PadOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit PadOp(OpConstructContext *context) explicit PadOp(OpConstructContext *context)
...@@ -116,10 +116,10 @@ class PadOp<DeviceType::CPU, T> : public Operation { ...@@ -116,10 +116,10 @@ class PadOp<DeviceType::CPU, T> : public Operation {
for (index_t c = 0; c < o_channel; ++c) { for (index_t c = 0; c < o_channel; ++c) {
index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add); index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add);
const index_t in_offset = (((b_in * channel + c_in) * height) + const index_t in_offset =
h_in) * width; (((b_in * channel + c_in) * height) + h_in) * width;
index_t out_offset = (((b * o_channel + c) * o_height) + index_t out_offset =
h) * o_width; (((b * o_channel + c) * o_height) + h) * o_width;
for (index_t i = 0, j = paddings_[6] + l_add; for (index_t i = 0, j = paddings_[6] + l_add;
i < paddings_[6]; ++i, --j) { i < paddings_[6]; ++i, --j) {
...@@ -169,8 +169,8 @@ class PadOp<DeviceType::CPU, T> : public Operation { ...@@ -169,8 +169,8 @@ class PadOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class PadOp<DeviceType::GPU, T> : public Operation { class PadOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit PadOp(OpConstructContext *context) explicit PadOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -180,7 +180,7 @@ class PadOp<DeviceType::GPU, T> : public Operation { ...@@ -180,7 +180,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
float constant_value = Operation::GetOptionalArg<float>( float constant_value = Operation::GetOptionalArg<float>(
"constant_value", 0.0); "constant_value", 0.0);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PadKernel<T>>( kernel_ = make_unique<opencl::image::PadKernel>(
type, paddings, constant_value); type, paddings, constant_value);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -198,18 +198,11 @@ class PadOp<DeviceType::GPU, T> : public Operation { ...@@ -198,18 +198,11 @@ class PadOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistryBase *op_registry) { void RegisterPad(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp, MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -16,8 +16,6 @@ ...@@ -16,8 +16,6 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include "mace/ops/pooling.h"
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory> #include <memory>
...@@ -28,6 +26,7 @@ ...@@ -28,6 +26,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pooling.h" #include "mace/ops/opencl/image/pooling.h"
#include "mace/ops/opencl/buffer/pooling.h" #include "mace/ops/opencl/buffer/pooling.h"
...@@ -486,15 +485,15 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase { ...@@ -486,15 +485,15 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase { class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
public: public:
explicit PoolingOp(OpConstructContext *context) explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) { : PoolingOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>(); kernel_ = make_unique<opencl::image::PoolingKernel>();
} else { } else {
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>(); kernel_ = make_unique<opencl::buffer::PoolingKernel>();
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) { ...@@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Pooling", PoolingOp);
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/reduce.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <set> #include <set>
#include <vector> #include <vector>
#include "mace/ops/common/reduce_type.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/runtime/cpu/cpu_runtime.h"
...@@ -868,15 +867,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims( ...@@ -868,15 +867,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase { class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
public: public:
explicit ReduceOp(OpConstructContext *context) explicit ReduceOp(OpConstructContext *context)
: ReduceOpBase(context) { : ReduceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_, kernel_ = make_unique<opencl::image::ReduceKernel>(reduce_type_,
axis_, axis_);
keep_dims_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { ...@@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Reduce", ReduceOp);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Reduce") OpConditionBuilder("Reduce")
...@@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) { ...@@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
bool keep_dims = bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>( ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false); *op, "keepdims", false);
if (!keep_dims) { if (!keep_dims) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
auto axis = auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>( ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis"); *op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0 if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) { || tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -12,14 +12,13 @@ ...@@ -12,14 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_bicubic.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bicubic.h" #include "mace/ops/opencl/image/resize_bicubic.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -33,12 +32,12 @@ inline const std::shared_ptr<float> InitCoeffsTable() { ...@@ -33,12 +32,12 @@ inline const std::shared_ptr<float> InitCoeffsTable() {
// convolution algorithm. // convolution algorithm.
// https://en.wikipedia.org/wiki/Bicubic_interpolation // https://en.wikipedia.org/wiki/Bicubic_interpolation
auto coeffs_tab = std::shared_ptr<float>( auto coeffs_tab = std::shared_ptr<float>(
new float[(resize_bicubic::kTableSize + 1) * 2], new float[(common::utils::kTableSize + 1) * 2],
std::default_delete<float[]>()); std::default_delete<float[]>());
float *coeffs_tab_ptr = coeffs_tab.get(); float *coeffs_tab_ptr = coeffs_tab.get();
static const float A = -0.75f; static const float A = -0.75f;
for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { for (int i = 0; i <= common::utils::kTableSize; ++i) {
float x = i * 1.0f / resize_bicubic::kTableSize; float x = i * 1.0f / common::utils::kTableSize;
coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
x += 1.0; x += 1.0;
coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
...@@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, ...@@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit,
std::vector<int64_t> *indices) { std::vector<int64_t> *indices) {
auto in_loc = static_cast<int64_t>(scale * out_loc); auto in_loc = static_cast<int64_t>(scale * out_loc);
const float delta = scale * out_loc - in_loc; const float delta = scale * out_loc - in_loc;
const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); const int64_t offset = lrintf(delta * common::utils::kTableSize);
const float *coeffs_tab = GetCoeffsTable(); const float *coeffs_tab = GetCoeffsTable();
*weights = {coeffs_tab[offset * 2 + 1], *weights = {coeffs_tab[offset * 2 + 1],
coeffs_tab[offset * 2], coeffs_tab[offset * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], coeffs_tab[(common::utils::kTableSize - offset) * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; coeffs_tab[(common::utils::kTableSize - offset) * 2 + 1]};
*indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit),
Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)};
} }
...@@ -173,11 +172,11 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation { ...@@ -173,11 +172,11 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
} }
float height_scale = float height_scale =
resize_bicubic::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bicubic::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -202,8 +201,8 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation { ...@@ -202,8 +201,8 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeBicubicOp<DeviceType::GPU, T> : public Operation { class ResizeBicubicOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeBicubicOp(OpConstructContext *context) explicit ResizeBicubicOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -213,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation { ...@@ -213,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1}); "size", {-1, -1});
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>( kernel_ = make_unique<opencl::image::ResizeBicubicKernel>(
align_corners, size[0], size[1]); align_corners, size[0], size[1]);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) { ...@@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeBicubic", ResizeBicubicOp);
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_bilinear.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
...@@ -21,6 +19,7 @@ ...@@ -21,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bilinear.h" #include "mace/ops/opencl/image/resize_bilinear.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -223,11 +222,11 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation { ...@@ -223,11 +222,11 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation {
} }
float height_scale = float height_scale =
resize_bilinear::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bilinear::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -299,11 +298,11 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -299,11 +298,11 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
} }
float height_scale = float height_scale =
resize_bilinear::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bilinear::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -336,8 +335,8 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -336,8 +335,8 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeBilinearOp<DeviceType::GPU, T> : public Operation { class ResizeBilinearOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeBilinearOp(OpConstructContext *context) explicit ResizeBilinearOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -347,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation { ...@@ -347,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1}); "size", {-1, -1});
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>( kernel_ = make_unique<opencl::image::ResizeBilinearKernel>(
align_corners, size[0], size[1]); align_corners, size[0], size[1]);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) { ...@@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeBilinear", ResizeBilinearOp);
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_nearest_neighbor.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_nearest_neighbor.h" #include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -115,11 +114,11 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation { ...@@ -115,11 +114,11 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
} }
float height_scale = float height_scale =
resize_nearest_neighbor::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_nearest_neighbor::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
ResizeImageNCHW(context, ResizeImageNCHW(context,
...@@ -142,15 +141,15 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation { ...@@ -142,15 +141,15 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation { class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeNearestNeighborOp(OpConstructContext *context) explicit ResizeNearestNeighborOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
bool align_corners = Operation::GetOptionalArg<bool>( bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false); "align_corners", false);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>( kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel>(
align_corners); align_corners);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) { ...@@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU, float); ResizeNearestNeighborOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor",
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp);
ResizeNearestNeighborOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_nearest_neighbor {
inline float CalculateResizeScale(index_t in_size,
index_t out_size,
bool align_corners) {
return (align_corners && out_size > 1)
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_nearest_neighbor
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
...@@ -35,10 +35,10 @@ ...@@ -35,10 +35,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class SoftmaxOp; class SoftmaxOp;
template <> template<>
class SoftmaxOp<DeviceType::CPU, float> : public Operation { class SoftmaxOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
...@@ -407,17 +407,17 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -407,17 +407,17 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SoftmaxOp<DeviceType::GPU, T> : public Operation { class SoftmaxOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
bool use_log = ( bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false)); Operation::GetOptionalArg<bool>("use_log", false));
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::image::SoftmaxKernel>(use_log);
} else { } else {
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::buffer::SoftmaxKernel>(use_log);
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -433,7 +433,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation { ...@@ -433,7 +433,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterSoftmax(OpRegistryBase *op_registry) { void RegisterSoftmax(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { ...@@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Softmax", SoftmaxOp);
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { ...@@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
if (op->output_shape(0).dims_size() != 2 && if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation { ...@@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation {
} }
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class SpaceToBatchNDOp; class SpaceToBatchNDOp;
template <> template<>
class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
public: public:
explicit SpaceToBatchNDOp(OpConstructContext *context) explicit SpaceToBatchNDOp(OpConstructContext *context)
...@@ -302,13 +302,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase { ...@@ -302,13 +302,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase { class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase {
public: public:
explicit SpaceToBatchNDOp(OpConstructContext *context) explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) { : SpaceToBatchOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>(); kernel_ = make_unique<opencl::image::SpaceToBatchKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { ...@@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) {
SpaceToBatchNDOp, DeviceType::CPU, uint8_t); SpaceToBatchNDOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp);
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class SpaceToDepthOp : public Operation { class SpaceToDepthOp : public Operation {
public: public:
explicit SpaceToDepthOp(OpConstructContext *context) explicit SpaceToDepthOp(OpConstructContext *context)
...@@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation { ...@@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SpaceToDepthOp<DeviceType::GPU, T> : public Operation { class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SpaceToDepthOp(OpConstructContext *context) explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1); int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size); kernel_ = make_unique<opencl::image::SpaceToDepthKernel>(block_size);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) { ...@@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth", MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float); SpaceToDepthOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -100,14 +100,14 @@ class SplitOp<DeviceType::CPU, T> : public Operation { ...@@ -100,14 +100,14 @@ class SplitOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SplitOp<DeviceType::GPU, T> : public Operation { class SplitOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SplitOp(OpConstructContext *context) explicit SplitOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int32_t axis = Operation::GetOptionalArg<int>("axis", 3); int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis); kernel_ = make_unique<opencl::image::SplitKernel>(axis);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { ...@@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Split", SplitOp, MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp);
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class SqrDiffMeanOp : public Operation { class SqrDiffMeanOp : public Operation {
public: public:
explicit SqrDiffMeanOp(OpConstructContext *context) explicit SqrDiffMeanOp(OpConstructContext *context)
...@@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation { ...@@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation {
} }
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation { class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SqrDiffMeanOp(OpConstructContext *context) explicit SqrDiffMeanOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SqrDiffMeanKernel<T>>(); kernel_ = make_unique<opencl::image::SqrDiffMeanKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -101,18 +100,11 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation { ...@@ -101,18 +100,11 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterSqrDiffMean(OpRegistryBase *op_registry) { void RegisterSqrDiffMean(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp);
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -20,18 +20,21 @@ ...@@ -20,18 +20,21 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> class SqueezeOpRaw : public Operation {
class SqueezeOp : public Operation {
public: public:
explicit SqueezeOp(OpConstructContext *context) explicit SqueezeOpRaw(OpConstructContext *context,
DeviceType device_type,
DataType data_type)
: Operation(context), : Operation(context),
axis_(Operation::GetRepeatedArgs<int>("axis", {})), axis_(Operation::GetRepeatedArgs<int>("axis", {})),
checked_(false) {} checked_(false),
data_type_(data_type),
device_type_(device_type) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
if (!checked_ && D == DeviceType::CPU if (!checked_ && device_type_ == DeviceType::CPU
&& DataTypeToEnum<T>::value != DT_UINT8) { && data_type_ != DT_UINT8) {
auto has_df = Operation::GetOptionalArg<int>( auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0); "has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) { if (has_df && this->Input(0)->dim_size() == 4) {
...@@ -62,6 +65,16 @@ class SqueezeOp : public Operation { ...@@ -62,6 +65,16 @@ class SqueezeOp : public Operation {
private: private:
std::vector<int> axis_; std::vector<int> axis_;
bool checked_; bool checked_;
DataType data_type_;
DeviceType device_type_;
};
template<DeviceType D, typename T>
class SqueezeOp : public SqueezeOpRaw {
public:
explicit SqueezeOp(OpConstructContext *context)
: SqueezeOpRaw(context, D, DataTypeToEnum<T>::value) {
}
}; };
void RegisterSqueeze(OpRegistryBase *op_registry) { void RegisterSqueeze(OpRegistryBase *op_registry) {
...@@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { ...@@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Squeeze", SqueezeOp);
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Squeeze") OpConditionBuilder("Squeeze")
...@@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { ...@@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
if (op->output_shape(0).dims_size() != 2 && if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -37,55 +37,73 @@ def encrypt_code(code_str): ...@@ -37,55 +37,73 @@ def encrypt_code(code_str):
return encrypted_arr return encrypted_arr
def create_output_dir(dir_path):
if os.path.exists(dir_path):
if os.path.isdir(dir_path):
try:
shutil.rmtree(dir_path)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % dir_path)
else:
raise RuntimeError(
"Cannot delete non-directory %s, inspect ",
"and remove manually" % dir_path)
os.makedirs(dir_path)
def write_cl_encrypted_kernel_to_file(
encrypted_code_maps, template_path, output_path):
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cl_encrypted_kernel = env.get_template(template_path).render(
tag='codegen',
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
with open(output_path, "w") as w_file:
w_file.write(cl_encrypted_kernel)
def get_module_key(file_name):
module_key = None
if file_name[-3:] == ".cl":
module_key = file_name[:-3]
elif file_name[-2:] == ".h":
module_key = file_name
return module_key
def encrypt_opencl_codegen(cl_kernel_dir, output_path): def encrypt_opencl_codegen(cl_kernel_dir, output_path):
if not os.path.exists(cl_kernel_dir): if not os.path.exists(cl_kernel_dir):
print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!") print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!")
header_code = ""
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-2:] == ".h":
with open(file_path, "r") as f:
header_code += f.read()
encrypted_code_maps = {} encrypted_code_maps = {}
for file_name in os.listdir(cl_kernel_dir): for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name) file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-3:] == ".cl": module_key = get_module_key(file_name)
if len(module_key) > 0:
with open(file_path, "r") as f: with open(file_path, "r") as f:
code_str = "" code_str = ""
headers = []
for line in f.readlines(): for line in f.readlines():
if "#include <common.h>" in line: if "#include <common.h>" in line:
code_str += header_code headers.append(get_module_key("common.h"))
else: else:
code_str += line code_str += line
encrypted_code_arr = encrypt_code(code_str) encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr encrypted_code = {}
encrypted_code['headers'] = headers
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) encrypted_code['code'] = encrypted_code_arr
cpp_cl_encrypted_kernel = env.get_template( encrypted_code_maps[module_key] = encrypted_code
'str2vec_maps.cc.jinja2').render(
maps=encrypted_code_maps, create_output_dir(os.path.dirname(output_path))
data_type='unsigned char', write_cl_encrypted_kernel_to_file(
variable_name='kEncryptedProgramMap') encrypted_code_maps, 'str2vec_maps.cc.jinja2', output_path)
output_path_h = output_path.replace('.cc', '.h')
output_dir = os.path.dirname(output_path) write_cl_encrypted_kernel_to_file(
if os.path.exists(output_dir): encrypted_code_maps, 'str2vec_maps.h.jinja2', output_path_h)
if os.path.isdir(output_dir):
try:
shutil.rmtree(output_dir)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % output_dir)
else:
raise RuntimeError(
"Cannot delete non-directory %s, inspect ",
"and remove manually" % output_dir)
os.makedirs(output_dir)
with open(output_path, "w") as w_file:
w_file.write(cpp_cl_encrypted_kernel)
print('Generate OpenCL kernel done.') print('Generate OpenCL kernel done.')
......
...@@ -14,24 +14,32 @@ ...@@ -14,24 +14,32 @@
// This is a generated file. DO NOT EDIT! // This is a generated file. DO NOT EDIT!
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include <map> #include <map>
#include <string> #include <string>
#include <vector>
namespace mace { namespace mace {
namespace {{tag}} {
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} = const std::map<std::string, ClProgramInfo> {{variable_name}} = {
{ {% for key, encrypted_code in maps.items() %}
{% for key, value in maps.items() %} {
"{{key}}", {
{ {
"{{key}}", {%- for header in encrypted_code['headers'] -%}
"{{header}}",
{%- endfor -%}
},
{ {
{%- for ele in value -%} {%- for ele in encrypted_code['code'] -%}
{{ele}}, {{ele}},
{%- endfor -%} {%- endfor -%}
} }
}
}, // {{key}} }, // {{key}}
{% endfor %} {% endfor %}
}; };
} // {{tag}}
} // namespace mace } // namespace mace
...@@ -12,23 +12,21 @@ ...@@ -12,23 +12,21 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_RESIZE_BILINEAR_H_ // This is a generated file. DO NOT EDIT!
#define MACE_OPS_RESIZE_BILINEAR_H_
#include "mace/core/types.h" #include <map>
#include <string>
#include <vector>
namespace mace { namespace mace {
namespace ops { namespace {{tag}} {
namespace resize_bilinear {
inline float CalculateResizeScale(index_t in_size, struct ClProgramInfo {
index_t out_size, const std::vector<std::string> headers_;
bool align_corners) { const std::vector<{{data_type}}> encrypted_code_;
return (align_corners && out_size > 1) };
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bilinear
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BILINEAR_H_ extern const std::map<std::string, ClProgramInfo> {{variable_name}};
} // {{tag}}
} // namespace mace
...@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): ...@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
unused_var = repository_ctx.path(Label("//:.git/refs/heads/master")) unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
ret = repository_ctx.execute( ret = repository_ctx.execute(
["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path], ["test", "-f", "%s/mace/ops/opencl/cl/common.cl" % mace_root_path],
) )
if ret.return_code == 0: if ret.return_code == 0:
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
...@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): ...@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
python_bin_path, python_bin_path,
"%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path, "%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
"--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path, "--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
"--output_path=%s/encrypt_opencl_kernel" % generated_files_path, "--output_path=%s/encrypt_opencl_kernel.cc" % generated_files_path,
], quiet = False) ], quiet = False)
encrypt_opencl_kernel_repository = repository_rule( encrypt_opencl_kernel_repository = repository_rule(
......
...@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters, ...@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
auto transform_func = [&]() { auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, .Transform(&context,
net.ws()->GetTensor("Input"), net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL, OpenCLBufferType::IN_OUT_CHANNEL,
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "mace/benchmark_utils/test_benchmark.h" #include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "mace/benchmark_utils/test_benchmark.h" #include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/pooling.h" #include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
...@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor( Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
...@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, ...@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor( Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT); "I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
...@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, ...@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Transform // Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
......
...@@ -45,7 +45,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -45,7 +45,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
"BtOutput", context.device()->allocator(), "BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value); DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output); type, MemoryType::GPU_BUFFER, 0, bt_output);
...@@ -54,7 +54,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -54,7 +54,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *output = net.ws()->CreateTensor( Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(), "Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value); DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, bt_output, .Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output); type, MemoryType::GPU_BUFFER, 0, output);
...@@ -90,7 +90,7 @@ void TestArgumentTransform(const index_t input_size) { ...@@ -90,7 +90,7 @@ void TestArgumentTransform(const index_t input_size) {
Tensor *output = net.ws()->CreateTensor( Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(), "Output", context.device()->allocator(),
DataTypeToEnum<T>::value); DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
......
...@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context, ...@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
DtToCLCMDDt(DataTypeToEnum<float>::value)); DtToCLCMDDt(DataTypeToEnum<float>::value));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<float>::value)); DtToCLDt(DataTypeToEnum<float>::value));
built_options.emplace( built_options.emplace(
"-DCMD_DATA_TYPE=" + "-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<float>::value)); DtToCLCMDDt(DataTypeToEnum<float>::value));
} }
cl::Kernel kernel; cl::Kernel kernel;
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include <vector> #include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include <vector> #include <vector>
#include "mace/ops/reduce.h" #include "mace/ops/common/reduce_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册