Commit 85cef1d8 authored by luxuhui

Adjust OpenCL code to reduce the size of libmace.so

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent 23d985f7
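This commit shrinks libmace.so by building each OpenCL kernel for a single float type instead of instantiating both float and half variants: GPU op classes drop their template<typename T> parameter and become explicit float specializations, the per-dtype registration pairs collapse into one MACE_REGISTER_GPU_OP call, and half requests are served by remapping DT_HALF to DT_FLOAT when the registry key is built. A condensed sketch of the registration pattern, assembled from the hunks below:

    // Before: one GPU kernel instantiation registered per data type.
    //   MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::GPU, float);
    //   MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::GPU, half);
    // After: a single float registration that compiles away without OpenCL.
    #ifdef MACE_ENABLE_OPENCL
    #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
      op_registry->Register(                                       \
          op_type, DeviceType::GPU, DT_FLOAT,                      \
          OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
    #else
    #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
    #endif

    MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);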
......@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA)
endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
......@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
}
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MyCustomOp<DeviceType::GPU, T> : public Operation {
template<>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
......@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}
} // namespace ops
......
......@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"],
)
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library(
name = "generated_models",
......@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library(
name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [
"-Werror",
"-Wextra",
......
......@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
......
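With the half registrations removed, the one-line change in OpRegistryBase::CreateOperation above is what keeps half models running on the GPU: a DT_HALF request now resolves to the float creator. A minimal illustration, assuming the OpKeyBuilder semantics shown in that hunk:

    // Both half and float requests now build the same registry key,
    // so a single float creator per GPU op suffices.
    DataType lookup_dtype = (dtype == DT_HALF) ? DT_FLOAT : dtype;
    std::string key = OpKeyBuilder(op_type)
                          .Device(device_type)
                          .TypeConstraint("T", lookup_dtype)
                          .Build();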
......@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const {
return operator_def_;
......@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template <typename T>
template<typename T>
inline T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template <typename T>
template<typename T>
inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
......@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
......@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
class OpRegistryBase {
public:
OpRegistryBase() = default;
......@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context,
DeviceType device_type) const;
template <class DerivedType>
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
......@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
......
......@@ -18,20 +18,19 @@
#include <fstream>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/macros.h"
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h"
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
case CL_SUCCESS:
......@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner):
std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner),
......@@ -332,7 +331,7 @@ OpenCLRuntime::OpenCLRuntime(
cl_int err;
if (gpu_type_ == GPUType::QUALCOMM_ADRENO
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
std::vector<cl_context_properties> context_properties;
context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties,
......@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
};
context_ = std::shared_ptr<cl::Context>(
......@@ -399,7 +398,7 @@ OpenCLRuntime::OpenCLRuntime(
if (cached_binary_platform_info != platform_info_) {
if (precompiled_binary_storage_ == nullptr) {
VLOG(1) << "There is no precompiled OpenCL binary in"
" all OpenCL binary paths.";
" all OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
......@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true;
}
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Find program " << program_name << " failed.";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header(" << header << ") is empty.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from source
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source != kEncryptedProgramMap.end()) {
std::string kernel_source;
MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source);
*program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str());
......
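The new GetProgramSourceByName above assembles a kernel's full source by deobfuscating each dependent header listed in the generated kEncryptedProgramMap before the program's own code, which is why BuildProgramFromSource no longer concatenates sources itself. The map moves into the generated header added by the Bazel changes below; a hypothetical sketch of that header's shape, inferred only from the fields accessed here (the struct name and exact layout are assumptions, not the generator's actual output):

    #include <map>
    #include <string>
    #include <vector>

    namespace mace {
    namespace codegen {
    struct EncryptedProgram {  // hypothetical name; the real generated type may differ
      std::vector<std::string> headers_;           // names of kernel headers to prepend
      std::vector<unsigned char> encrypted_code_;  // obfuscated OpenCL source bytes
    };
    extern const std::map<std::string, EncryptedProgram> kEncryptedProgramMap;
    }  // namespace codegen
    }  // namespace mace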
......@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS;
......
......@@ -118,9 +118,21 @@ def mace_version_genrule():
)
def encrypt_opencl_kernel_genrule():
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))],
outs = ["opencl/encrypt_opencl_kernel.cc"],
cmd = "cat $(SRCS) > $@;"
)
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = srcs,
outs = outs,
cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
)
......@@ -181,7 +181,6 @@ cc_library(
],
)
cc_library(
name = "internal_ops",
srcs = glob(
......@@ -239,10 +238,10 @@ cc_library(
name = "ops",
srcs = [
"registry/ops_registry.cc",
],
],
hdrs = [
"registry/ops_registry.h",
],
],
copts = [
"-Werror",
"-Wextra",
......
......@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
template<>
class ActivationOp<DeviceType::GPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
auto leakyrelu_coefficient =
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
}
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
......@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -29,10 +29,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class AddNOp;
template <>
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
......@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class AddNOp<DeviceType::GPU, T> : public Operation {
template<>
class AddNOp<DeviceType::GPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
kernel_ = make_unique<opencl::image::AddNKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
......@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchNormOp<DeviceType::GPU, T> : public Operation {
template<>
class BatchNormOp<DeviceType::GPU, float> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context) {
......@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
} // namespace ops
......
......@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
}
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
}
};
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
template<>
class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
......@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
}
} // namespace ops
......
......@@ -34,16 +34,16 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BiasAddOp;
template <>
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
{}
has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BiasAddOp<DeviceType::GPU, T> : public Operation {
template<>
class BiasAddOp<DeviceType::GPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
......@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
......@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -23,10 +23,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class ChannelShuffleOp;
template <typename T>
template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
......@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
template<>
class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1);
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_PAD_H_
#define MACE_OPS_PAD_H_
#ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace {
namespace ops {
......@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops
} // namespace mace
#endif // MACE_OPS_PAD_H_
#endif // MACE_OPS_COMMON_PAD_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_POOLING_H_
#define MACE_OPS_POOLING_H_
#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace {
......@@ -23,4 +23,4 @@ enum PoolingType {
};
} // namespace mace
#endif // MACE_OPS_POOLING_H_
#endif // MACE_OPS_COMMON_POOLING_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REDUCE_H_
#define MACE_OPS_REDUCE_H_
#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace {
......@@ -28,4 +28,4 @@ enum ReduceType {
};
} // namespace mace
#endif // MACE_OPS_REDUCE_H_
#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
......@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_
#define MACE_OPS_RESIZE_BICUBIC_H_
#ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_bicubic {
namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size,
......@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_
#endif // MACE_OPS_COMMON_UTILS_H_
......@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_;
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class ConcatOp;
template <typename T>
template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
......@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
template<>
class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
kernel_ = make_unique<opencl::image::ConcatKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
......@@ -228,51 +227,44 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return { DeviceType::CPU };
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return {DeviceType::CPU};
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return {DeviceType::CPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
}
}
// Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return {DeviceType::CPU};
}
}
// Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU };
}
}
return { DeviceType::CPU, DeviceType::GPU };
}));
return {DeviceType::CPU, DeviceType::GPU};
}));
}
} // namespace ops
......
......@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
template<>
class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
......@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
}
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
......@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
}
} // namespace ops
......
......@@ -24,10 +24,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class CropOp;
template <class T>
template<class T>
class CropOp<DeviceType::CPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
......@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
......@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>();
const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data());
......@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
private:
void crop_copy(const T* input_data, T* output_data,
void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
const int32_t* offsets) {
const int32_t *offsets) {
const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3];
......@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr =
T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch =
const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
......@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class CropOp<DeviceType::GPU, T> : public Operation {
template<>
class CropOp<DeviceType::GPU, float> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
MACE_NOT_IMPLEMENTED;
......@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
......@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
3,
......@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class DepthToSpaceOp : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
......@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
template<>
class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
}
} // namespace ops
......
......@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
}
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
......@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
......@@ -467,8 +464,8 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
}));
}
......
......@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
......@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
} // namespace ops
......
......@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class EltwiseOp<DeviceType::GPU, T> : public Operation {
template<>
class EltwiseOp<DeviceType::GPU, float> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context) {
......@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
}
} // namespace ops
......
......@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
template<>
class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
}
} // namespace ops
......
......@@ -18,7 +18,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation {
public:
explicit IdentityOp(OpConstructContext *context)
......@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
};
void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -19,7 +19,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation {
public:
explicit InferConv2dShapeOp(OpConstructContext *context)
......@@ -66,20 +65,23 @@ class InferConv2dShapeOp : public Operation {
int32_t out_h = 0, out_w = 0;
if (!paddings.empty()) {
out_h = (in_h - kernels[2] + paddings[0]) / strides[0] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else {
switch (padding_type) {
case SAME:
case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1];
break;
case VALID:
}
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1];
break;
default:
}
default: {
MACE_NOT_IMPLEMENTED;
break;
}
}
}
......@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
};
void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -77,7 +77,7 @@ class MatMulOpBase : public Operation {
} else {
MACE_CHECK(lhs_rank == 2 || rhs_rank == 2,
"Either lhs or rhs matrix should has rank 2 "
"for non-batched matrix multiplication");
"for non-batched matrix multiplication");
}
index_t
......@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
template<>
class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
......@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
};
#endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
......@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t size = input->dim(0);
......@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS;
}
MaceStatus BufferTransform::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, output);
default:
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
class BufferTransform : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpContext *context,
......@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
......@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(kwg_size_);
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
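// paddings holds the total pad per dimension; top/left take the floor half.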
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
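// Round the output width up to tile_w (2 for the 1x1 kernel, 4 otherwise)
// and the input channels up to tile_c = 4, the tile sizes the conv kernels
// are written to consume.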
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocating it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
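// A regrown scratch buffer means the padded-input buffer moved, so the
// cached kernel arguments must be refreshed (treated as an input change).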
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
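// Run either the specialized 1x1 kernel or the general conv kernel; the pad
// and conv events are merged into a single future below.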
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
......@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocating it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
} // namespace depthwise
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
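// The fake filter's output-channel count is multiplier * input channels,
// matching the real depthwise output, so the shared conv padding helpers apply.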
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
......@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
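// kernels_[0] padded the input above; kernels_[1] is the pooling kernel.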
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
auto input_dtype = input->dtype();
auto input_dt = DtToCLDt(input_dtype);
built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
auto output_dtype = output->dtype();
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
if (pooling_type == MAX && input_dtype == output_dtype) {
built_options.emplace("-DDATA_TYPE=" + input_dt);
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
......@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
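// Channels are processed four at a time; remain_channels counts the padded
// lanes in the last block, which the kernel needs to ignore.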
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -29,7 +29,7 @@ namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
explicit SoftmaxKernel(bool use_log)
......@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -20,11 +20,11 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BufferTransformOp;
template <typename T>
class BufferTransformOp<DeviceType::GPU, T> : public Operation {
template<>
class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
......@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
......@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_;
};
void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
}
} // namespace ops
......
......@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix;
}
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size) {
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
const DataType dt = input->dtype();
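// The transformed filter now keeps the source tensor's dtype instead of a
// compile-time template parameter.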
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// redirect the op's input to the transformed tensor and mark the original unused
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops
} // namespace mace
......@@ -28,17 +28,16 @@
namespace mace {
namespace ops {
// Only used for GPU Operation (BufferTransform)
template<typename T>
class OpenCLBufferTransformer {
public:
OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>();
kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>();
kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>();
kernel_ = make_unique<opencl::buffer::BufferTransform>();
}
}
......@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) {
......@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
......@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size = 0) {
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// redirect the op's input to the transformed tensor and mark the original unused
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
const int wino_blk_size = 0);
} // namespace ops
} // namespace mace
......
......@@ -17,8 +17,9 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
class OpContext;
......
......@@ -17,7 +17,10 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace {
......
......@@ -19,6 +19,9 @@
#include <vector>
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace {
......
......@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h"
......
......@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
}
}
std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
......
......@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select
// e.g. half -> float
std::string DtToCLCondDt(const DataType dt);
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ActivationKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
switch (activation_) {
case RELU: {
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
}
case RELUX: {
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
}
case PRELU: {
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
}
case TANH: {
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
}
case SIGMOID: {
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
}
case LEAKYRELU: {
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation_;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,12 +31,11 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit,
T leakyrelu_coefficient)
float relux_max_limit,
float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
......@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private:
ActivationType activation_;
T relux_max_limit_;
T leakyrelu_coefficient_;
float relux_max_limit_;
float leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus AddNKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
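// The kernel source is specialized via -DINPUT_NUM, and at most four
// inputs are supported per built kernel.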
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
......@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
MaceStatus BatchNormKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
  case NOOP:
    break;
  case RELU:
    built_options.emplace("-DUSE_RELU");
    break;
  case RELUX:
    built_options.emplace("-DUSE_RELUX");
    break;
  case TANH:
    built_options.emplace("-DUSE_TANH");
    break;
  case SIGMOID:
    built_options.emplace("-DUSE_SIGMOID");
    break;
  case LEAKYRELU:
    built_options.emplace("-DUSE_LEAKYRELU");
    break;
  default:
    LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
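// When mean and var are absent (not_folded == false) the kernel is built with
// -DFOLDED_CONSTANT and expects scale/offset to already absorb the
// normalization: y = s * x + b with s = scale / sqrt(var + epsilon) and
// b = offset - mean * s. A host-side folding sketch (illustrative only, not
// part of this commit):
#include <cmath>
#include <cstddef>
#include <vector>
inline void FoldBatchNormParams(const std::vector<float> &scale,
                                const std::vector<float> &offset,
                                const std::vector<float> &mean,
                                const std::vector<float> &var,
                                float epsilon,
                                std::vector<float> *folded_scale,
                                std::vector<float> *folded_offset) {
  folded_scale->resize(scale.size());
  folded_offset->resize(scale.size());
  for (size_t c = 0; c < scale.size(); ++c) {
    const float s = scale[c] / std::sqrt(var[c] + epsilon);
    (*folded_scale)[c] = s;
    (*folded_offset)[c] = offset[c] - mean[c] * s;
  }
}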
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
......@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BatchToSpaceKernel::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = batch_tensor->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
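// For reference, assuming the TensorFlow-style NHWC batch_to_space convention
// (paddings[0] / paddings[2] act as the top / left crops, and the block index
// is folded into the batch dimension), the coordinate mapping from the space
// tensor back to the batch tensor is sketched below. Illustrative only; the
// exact layout convention is the assumption here:
#include <cstdint>
inline void BatchToSpaceCoord(int64_t space_n, int64_t space_h,
                              int64_t space_w, int64_t out_batch,
                              int block_h, int block_w,
                              int crop_top, int crop_left,
                              int64_t *batch_n, int64_t *batch_h,
                              int64_t *batch_w) {
  const int64_t h = space_h + crop_top;   // undo the top crop
  const int64_t w = space_w + crop_left;  // undo the left crop
  *batch_h = h / block_h;
  *batch_w = w / block_w;
  const int64_t block_idx = (h % block_h) * block_w + (w % block_w);
  *batch_n = block_idx * out_batch + space_n;  // which block copy in batch
}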
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
......@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BiasAddKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
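// When the runtime reports no support for non-uniform work-groups (core only
// since OpenCL 2.0), the global size must be a whole multiple of the local
// size, so gws is rounded up and the kernel skips the padded work items using
// the true global sizes passed via MACE_SET_3D_GWS_ARGS. The rounding used
// above, as a standalone sketch:
#include <cstdint>
inline uint32_t RoundUpSketch(uint32_t value, uint32_t multiple) {
  return multiple == 0 ? value : (value + multiple - 1) / multiple * multiple;
}
// e.g. gws = {33, 70, 9} with lws = {4, 4, 4} launches {36, 72, 12} items.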
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
......@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BufferToImage::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
  kernel_name = "filter_buffer_to_image";
  break;
case DW_CONV2D_FILTER:
  kernel_name = "dw_filter_buffer_to_image";
  break;
case IN_OUT_CHANNEL:
  kernel_name = "in_out_buffer_to_image";
  break;
case ARGUMENT:
  kernel_name = "arg_buffer_to_image";
  break;
case IN_OUT_HEIGHT:
  kernel_name = "in_out_height_buffer_to_image";
  break;
case IN_OUT_WIDTH:
  kernel_name = "in_out_width_buffer_to_image";
  break;
case WEIGHT_HEIGHT:
  kernel_name = "weight_height_buffer_to_image";
  break;
case WEIGHT_WIDTH:
  kernel_name = "weight_width_buffer_to_image";
  break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
auto input_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
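// For the common buffer types CalImage2DShape derives the 2D image extents
// from the NHWC buffer shape; e.g. IN_OUT_CHANNEL maps [N, H, W, C] to an
// image of width ceil(C/4) * W and height N * H (four channels per texel).
// A sketch of that one case (illustrative only; OpenCLUtil::CalImage2DShape
// is the authoritative implementation):
#include <cstddef>
#include <cstdint>
#include <vector>
inline std::vector<size_t> InOutChannelImageShape(
    const std::vector<int64_t> &nhwc) {
  const size_t channel_blocks = static_cast<size_t>((nhwc[3] + 3) / 4);
  return {channel_blocks * static_cast<size_t>(nhwc[2]),  // image width
          static_cast<size_t>(nhwc[0] * nhwc[1])};        // image height
}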
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
......@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ChannelShuffleKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
           "input channels must be an integral multiple of the group count. ",
           input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
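// Channel shuffle with G groups behaves like reshaping the C channels to
// [G, C/G] and transposing to [C/G, G]; every output channel therefore pulls
// from exactly one input channel. A reference mapping (illustrative only):
#include <cstdint>
inline int64_t ShuffleSourceChannel(int64_t out_channel, int64_t groups,
                                    int64_t channels_per_group) {
  return (out_channel % groups) * channels_per_group + out_channel / groups;
}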
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
......@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
           "input channels must be an integral multiple of the group count. ",
           input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
auto data_dt = input0->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
......@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
......@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
......@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
}
} // namespace concat
MaceStatus ConcatKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
           "Ranks of all input tensors must be the same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
           "Dimensions of inputs must be equal except along the concat axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
&input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context,
&kernel_,
input_list,
output,
&kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
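// Why the DIVISIBLE_FOUR option above matters: with four channels packed per
// texel, the seam between the two inputs falls on a texel boundary only when
// the first input's channel count is a multiple of 4; otherwise every output
// texel past the seam mixes components of both inputs and must be re-packed
// component-wise. A quick predicate for the fast path (illustrative only):
inline bool ConcatSeamTexelAligned(long channels0) {
  return channels0 % 4 == 0;
}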
......
......@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
ConcatKernel() {}
......@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
           "Ranks of all input tensors must be the same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
           "Dimensions of inputs must be equal except along the concat axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, &kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
// The GPU path only supports 4x4 and 2x2 Winograd convolution
if (*wino_blk_size == 4) {
// if block size == 4 exceeds the OpenCL image size limitation, fall back to 2
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
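// Worked example of the limit check above (illustrative numbers only): for
// output shape [1, 225, 225, 32], in_channels = 16 and block_size = 4,
//   transformed_width = 1 * ceil(225/4) * ceil(225/4) = 57 * 57 = 3249
//   sqr_block         = (4 + 2) * (4 + 2)             = 36
// so the transformed images need extents of 3249 (width) and 36 * 16 = 576
// resp. 36 * 32 = 1152 (height). All three must stay below the device's
// maximum image2d sizes for the 4x4 block to be used; otherwise the code
// falls back to 2x2.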
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// use Winograd convolution
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
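// Typical call sequence (illustrative sketch only, using the signatures
// above): the conv2d op first asks CheckUseWinograd whether the 3x3 stride-1
// fast path applies, which may downgrade wino_blk_size from 4 to 2, then
// passes the result (or 0) to Compute, which dispatches to
// WinogradConv2dK3x3S1 / Conv2dK1x1 / Conv2dK3x3 / Conv2d.
inline int SelectWinoBlkSize(Conv2dKernel *kernel, OpenCLRuntime *runtime,
                             const std::vector<index_t> &filter_shape,
                             const std::vector<index_t> &output_shape,
                             const int *strides, const int *dilations) {
  int wino_blk_size = 4;  // preferred block size
  if (!kernel->CheckUseWinograd(runtime, filter_shape, output_shape,
                                strides, dilations, &wino_blk_size)) {
    wino_blk_size = 0;  // take the direct (non-Winograd) kernels
  }
  return wino_blk_size;
}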
......@@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size[3]);
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
bool CheckUseWinograd(
......@@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
// The GPU path only supports 4x4 and 2x2 Winograd convolution
if (*wino_blk_size == 4) {
// if block size == 4 exceeds the OpenCL image size limitation, fall back to 2
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// use Winograd convolution
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -66,21 +66,20 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
extern MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
MACE_UNUSED(padding);
MACE_UNUSED(dilations);
const index_t batch = output->dim(0);
......@@ -106,31 +105,38 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
......
......@@ -59,21 +59,20 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
extern MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -93,29 +92,36 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
......
......@@ -67,21 +67,20 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
extern MaceStatus Conv2d(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
MaceStatus Conv2d(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -101,29 +100,36 @@ extern MaceStatus Conv2d(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
......
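The recurring change in these conv hunks is that the kernels no longer take a DataType dt parameter and instead hardcode DT_FLOAT, so only one OpenCL program per kernel is compiled and shipped. A rough sketch of the effect on the build options, assuming DtToCLDt maps DT_FLOAT to "float" and DT_HALF to "half" (the real helper lives elsewhere in mace/ops/opencl):
#include <iostream>
#include <string>

enum DataType { DT_FLOAT, DT_HALF };

// Illustrative mapping only.
std::string DtToCLDt(DataType dt) {
  return dt == DT_HALF ? "half" : "float";
}

int main() {
  // Before: one program per instantiated type T (float and half).
  // After: a single float-typed program regardless of the tensor dtype.
  std::cout << "-DDATA_TYPE=" + DtToCLDt(DT_FLOAT) << "\n";  // -DDATA_TYPE=float
}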
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/crop.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus CropKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop OpenCL kernel requires at least 2 inputs";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
for (index_t i = 0; i < in0_dims; ++i) {
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i
<< " is out of bound, first input size "
<< input0->dim(i) << ", offset " << offsets[i]
<< ", second input size " << input1->dim(i);
}
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE opencl only supports cropping channel"
" offset divisible by 4.");
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = input0->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
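The crop offsets operate on channel blocks of 4 (the image layout packs 4 channels per pixel), which is why offsets[3] must be divisible by 4. A small worked example of the block arithmetic used above:
#include <cassert>
#include <cstdint>

int64_t RoundUpDiv4(int64_t v) { return (v + 3) >> 2; }

int main() {
  const int64_t channel_offset = 8;   // offsets[3]; checked to be % 4 == 0
  const int64_t output_channels = 13;
  assert(RoundUpDiv4(channel_offset) == 2);   // offset_chan_blk
  assert(RoundUpDiv4(output_channels) == 4);  // channel_blk: 13 -> 4 blocks
}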
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class CropKernel : public OpenCLCropKernel {
public:
explicit CropKernel(
......@@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus CropKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop OpenCL kernel requires at least 2 inputs";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
for (index_t i = 0; i < in0_dims; ++i) {
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i
<< " is out of bound, first input size "
<< input0->dim(i) << ", offset " << offsets[i]
<< ", second input size " << input1->dim(i);
}
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE opencl only supports cropping channel"
" offset divisible by 4.");
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
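The header change above shows the pattern this commit applies throughout: the template <typename T> wrapper, instantiated for both float and half, collapses into one non-templated class, so libmace.so carries a single copy of each kernel's host code. A toy sketch of the before/after shape, with illustrative names only:
// Before: template <typename T> class CropKernel { ... };
//         two instantiations (float, half) -> two copies of the code.
// After:  one class; the element type is decided when the OpenCL
//         program is built, not at C++ compile time.
enum DataType { DT_FLOAT, DT_HALF };

struct Tensor {
  DataType dtype() const { return DT_FLOAT; }
};

class CropKernel {
 public:
  DataType KernelDataType(const Tensor &input) const {
    return input.dtype();  // runtime dispatch replaces template dispatch
  }
};

int main() {
  Tensor t;
  return CropKernel().KernelDataType(t) == DT_FLOAT ? 0 : 1;
}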
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/deconv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus Deconv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
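The deconv work-group width above folds a 5-column output tile into the stride phases. A worked example of the width_blocks computation, under the same width_tile = 5:
#include <cassert>
#include <cstdint>

int main() {
  const int64_t width = 23, stride_w = 2, width_tile = 5;
  const int64_t n_strides = (width + stride_w - 1) / stride_w;  // 12
  const int64_t width_blocks =
      ((n_strides + width_tile - 1) / width_tile) * stride_w;   // 3 tiles * 2
  assert(n_strides == 12 && width_blocks == 6);
}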
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class Deconv2dKernel : public OpenCLDeconv2dKernel {
public:
MaceStatus Compute(
......@@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Deconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depth_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus DepthToSpaceKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size ",
input_depth);
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
"output channel not support:") << output_depth;
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
uint32_t gws[3];
if (output_depth < 3) {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
} else {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "depth_to_space";
if (output_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", output_depth));
if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
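The depth_to_space shape math trades channel depth for spatial resolution by a factor of block_size per axis. A quick check of the arithmetic:
#include <cassert>
#include <cstdint>

int main() {
  const int64_t block_size = 2;
  const int64_t in_h = 3, in_w = 5, in_d = 16;
  assert(in_d % (block_size * block_size) == 0);
  assert(in_h * block_size == 6);                  // output_height
  assert(in_w * block_size == 10);                 // output_width
  assert(in_d / (block_size * block_size) == 4);   // output_depth
}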
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
public:
explicit DepthToSpaceKernel(const int block_size)
......@@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size ",
input_depth);
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
"output channel not support:") << output_depth;
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
uint32_t gws[3];
if (output_depth < 3) {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
} else {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "depth_to_space";
if (output_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", output_depth));
if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(MakeString("-DSTRIDE=", stride));
switch (activation) {
......@@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
} // namespace depthwise
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, leakyrelu_coefficient,
&input_shape_, output, &kwg_size_);
}
} // namespace image
} // namespace opencl
} // namespace ops
......
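The "fake" conv2d filter above exists only so the shared padding and output-size helpers can be reused: a depthwise filter [multiplier, in_channels, kh, kw] is presented as a conv2d filter whose output-channel count is multiplier * in_channels. A sketch of that reshaping:
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Depthwise filter: multiplier = 1, in_channels = 32, 3x3 kernel.
  const std::vector<int64_t> filter = {1, 32, 3, 3};
  std::vector<int64_t> fake(4);
  fake[0] = filter[0] * filter[1];  // conv2d out_channels
  fake[1] = filter[1];
  fake[2] = filter[2];
  fake[3] = filter[3];
  assert(fake == (std::vector<int64_t>{32, 32, 3, 3}));
}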
......@@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
MaceStatus Compute(
......@@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, leakyrelu_coefficient,
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus DepthwiseDeconv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
const index_t batch = output_shape[0];
const index_t height = output_shape[1];
const index_t width = output_shape[2];
const index_t channels = output_shape[3];
const index_t input_channels = input->dim(3);
const index_t multiplier = filter->dim(0);
MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
"opencl image deconv only supports depthwise type group.");
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t channel_blocks = RoundUpDiv4(channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("depthwise_deconv2d_kernel_",
activation,
output->dim(0),
output->dim(1),
output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
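The deconv padding terms split the total padding per side (rounded up) and derive an alignment offset used when mapping output pixels back to input taps. A worked example of the shift arithmetic:
#include <cassert>

int main() {
  const int stride_h = 2;
  const int total_padding_h = 3;                     // padding_data[0]
  const int padding_h = (total_padding_h + 1) >> 1;  // 2
  const int align_h = stride_h - 1 - padding_h;      // -1
  assert(padding_h == 2 && align_h == -1);
}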
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
public:
MaceStatus Compute(
......@@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
const index_t batch = output_shape[0];
const index_t height = output_shape[1];
const index_t width = output_shape[2];
const index_t channels = output_shape[3];
const index_t input_channels = input->dim(3);
const index_t multiplier = filter->dim(0);
MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
"opencl image deconv only supports depthwise type group.");
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t channel_blocks = RoundUpDiv4(channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("depthwise_deconv2d_kernel_",
activation,
output->dim(0),
output->dim(1),
output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/eltwise.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus EltwiseKernel::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
bool swapped = false;
std::string input1_type = "";
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size() ||
input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1);
swapped = true;
}
if (input1->dim_size() == 1
|| (input1->dim(0) == 1 && input1->dim(1) == 1
&& input1->dim(2) == 1)) {
// Tensor-Vector element wise
if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
input1_type = "INPUT_VECTOR";
} else {
LOG(FATAL) << "Inputs not match the broadcast logic, "
<< MakeString(input0->shape()) << " vs "
<< MakeString(input1->shape());
}
} else { // must be 4-D
if (input0->dim(0) == input1->dim(0)
&& input1->dim(1) == 1
&& input1->dim(2) == 1
&& input0->dim(3) == input1->dim(3)) {
input1_type = "INPUT_BATCH_VECTOR";
} else if (input0->dim(0) == input1->dim(0)
&& input0->dim(1) == input1->dim(1)
&& input0->dim(2) == input1->dim(2)
&& input1->dim(3) == 1) {
// broadcast on channel dimension
input1_type = "INPUT_TENSOR_BC_CHAN";
} else {
LOG(FATAL) << "Element-Wise op only support broadcast on"
" channel dimension:"
"Tensor-BatchVector(4D-[N,1,1,C]) "
"and Tensor-Tensor(4D-[N,H,W,1]). but got "
<< MakeString(input0->shape()) << " vs "
<< MakeString(input1->shape());
}
}
}
}
if (scalar_input_index_ == 0) {
swapped = !swapped;
}
std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t batch_height_pixels = batch * height;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (!input1_type.empty()) {
built_options.emplace("-D" + input1_type);
}
if (swapped) built_options.emplace("-DSWAPPED");
if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) {
kernel_.setArg(idx++, scalar_input_);
} else {
kernel_.setArg(idx++, *(input1->opencl_image()));
}
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
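The eltwise broadcast handling above selects one of three kernel variants from the input shapes. A condensed sketch (not MACE's code) of that classification for 4-D inputs, after the larger tensor has been swapped into in0:
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

using Shape = std::vector<int64_t>;

std::string ClassifyInput1(const Shape &in0, const Shape &in1) {
  if (in1.size() == 1 || (in1[0] == 1 && in1[1] == 1 && in1[2] == 1))
    return in0[3] == in1.back() ? "INPUT_VECTOR" : "UNSUPPORTED";
  if (in0[0] == in1[0] && in1[1] == 1 && in1[2] == 1 && in0[3] == in1[3])
    return "INPUT_BATCH_VECTOR";      // [N,1,1,C]
  if (in0[0] == in1[0] && in0[1] == in1[1] && in0[2] == in1[2] && in1[3] == 1)
    return "INPUT_TENSOR_BC_CHAN";    // [N,H,W,1]
  return "UNSUPPORTED";
}

int main() {
  assert(ClassifyInput1({2, 8, 8, 16}, {16}) == "INPUT_VECTOR");
  assert(ClassifyInput1({2, 8, 8, 16}, {2, 1, 1, 16}) == "INPUT_BATCH_VECTOR");
  assert(ClassifyInput1({2, 8, 8, 16}, {2, 8, 8, 1}) == "INPUT_TENSOR_BC_CHAN");
}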
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/eltwise.h"
#include "mace/ops/common/eltwise_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -32,7 +32,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class EltwiseKernel : public OpenCLEltwiseKernel {
public:
explicit EltwiseKernel(
......@@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus EltwiseKernel<T>::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
bool swapped = false;
std::string input1_type = "";
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size() ||
input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1);
swapped = true;
}
if (input1->dim_size() == 1
|| (input1->dim(0) == 1 && input1->dim(1) == 1
&& input1->dim(2) == 1)) {
// Tensor-Vector element wise
if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
input1_type = "INPUT_VECTOR";
} else {
LOG(FATAL) << "Inputs not match the broadcast logic, "
<< MakeString(input0->shape()) << " vs "
<< MakeString(input1->shape());
}
} else { // must be 4-D
if (input0->dim(0) == input1->dim(0)
&& input1->dim(1) == 1
&& input1->dim(2) == 1
&& input0->dim(3) == input1->dim(3)) {
input1_type = "INPUT_BATCH_VECTOR";
} else if (input0->dim(0) == input1->dim(0)
&& input0->dim(1) == input1->dim(1)
&& input0->dim(2) == input1->dim(2)
&& input1->dim(3) == 1) {
// broadcast on channel dimension
input1_type = "INPUT_TENSOR_BC_CHAN";
} else {
LOG(FATAL) << "Element-Wise op only support broadcast on"
" channel dimension:"
"Tensor-BatchVector(4D-[N,1,1,C]) "
"and Tensor-Tensor(4D-[N,H,W,1]). but got "
<< MakeString(input0->shape()) << " vs "
<< MakeString(input1->shape());
}
}
}
}
if (scalar_input_index_ == 0) {
swapped = !swapped;
}
std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t batch_height_pixels = batch * height;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (!input1_type.empty()) {
built_options.emplace("-D" + input1_type);
}
if (swapped) built_options.emplace("-DSWAPPED");
if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) {
kernel_.setArg(idx++, scalar_input_);
} else {
kernel_.setArg(idx++, *(input1->opencl_image()));
}
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/fully_connected.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus FullyConnectedKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, &kernel_));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
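      // Shape the first two GWS dimensions from the kernel's wave size:
      // one 4 x (wave_size / 4) slice per output block.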
gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
} else {
gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
}
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws_[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
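    // setArg with a size and a null pointer allocates __local memory,
    // used here as scratch for the intra-group reduction.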
kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel_.setArg(idx++, static_cast<int>(output_blocks));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws_.size());
for (size_t i = 0; i < lws_.size(); ++i) {
roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
}
MACE_OUT_OF_RANGE_VALIDATION;
MACE_CL_RET_STATUS(error);
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -23,6 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -30,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
public:
MaceStatus Compute(
......@@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus FullyConnectedKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, &kernel_));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
} else {
gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
}
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws_[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel_.setArg(idx++, static_cast<int>(output_blocks));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws_.size());
for (size_t i = 0; i < lws_.size(); ++i) {
roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
}
MACE_OUT_OF_RANGE_VALIDATION;
MACE_CL_RET_STATUS(error);
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ImageToBuffer::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
    case CONV2D_FILTER:
      kernel_name = "filter_image_to_buffer";
      break;
    case IN_OUT_CHANNEL:
      kernel_name = "in_out_image_to_buffer";
      break;
    case ARGUMENT:
      kernel_name = "arg_image_to_buffer";
      break;
    case IN_OUT_HEIGHT:
      kernel_name = "in_out_height_image_to_buffer";
      break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
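      // The image height covers (wino_blk_size + 2)^2 transformed blocks;
      // divide it out of the GWS.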
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
    case WEIGHT_HEIGHT:
      kernel_name = "weight_height_image_to_buffer";
      break;
    case WEIGHT_WIDTH:
      kernel_name = "weight_width_image_to_buffer";
      break;
    case DW_CONV2D_FILTER:
    case IN_OUT_WIDTH:
      LOG(FATAL) << "IN_OUT_WIDTH only supports buffer to image now";
      break;
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (output->dtype() == input->dtype()) {
auto data_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(output->opencl_buffer()));
if (type == CONV2D_FILTER) {
      const index_t inner_size =
          output->dim(1) * output->dim(2) * output->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(input->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -28,7 +28,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ImageToBuffer : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(OpContext *context,
......@@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_image_to_buffer";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_image_to_buffer";
break;
case ARGUMENT:
kernel_name = "arg_image_to_buffer";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_image_to_buffer";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
case WEIGHT_HEIGHT:
kernel_name = "weight_height_image_to_buffer";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_image_to_buffer";
break;
case DW_CONV2D_FILTER:
case IN_OUT_WIDTH:
LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now";
break;
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (output->dtype() == input->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(output->opencl_buffer()));
if (type == CONV2D_FILTER) {
      const index_t inner_size =
          output->dim(1) * output->dim(2) * output->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(input->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/lstm_cell.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus LSTMCellKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
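  // hidden_units % 4 == 0 is checked above, so >> 2 yields the number of
  // float4 blocks.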
const index_t w_blocks = hidden_units >> 2;
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape_padded,
OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, forget_bias_);
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,11 +30,10 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class LSTMCellKernel : public OpenCLLSTMCellKernel {
public:
explicit LSTMCellKernel(
const T forget_bias)
const float forget_bias)
: forget_bias_(forget_bias) {}
MaceStatus Compute(
OpContext *context,
......@@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel {
Tensor *output) override;
private:
T forget_bias_;
float forget_bias_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus LSTMCellKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
const index_t w_blocks = hidden_units >> 2;
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape_padded,
OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, static_cast<float>(forget_bias_));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/matmul.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus MatMulKernel::Compute(
OpContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b) {
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
OpenCLUtil::CalImage2DShape(padded_c_shape,
OpenCLBufferType::IN_OUT_HEIGHT,
&c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
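  // Tile C into 4x4 blocks; the GWS below assigns one block per work-item.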
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class MatMulKernel : public OpenCLMatMulKernel {
public:
MaceStatus Compute(
......@@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel {
uint32_t kwg_size_;
};
template <typename T>
MaceStatus MatMulKernel<T>::Compute(
OpContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b) {
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
OpenCLUtil::CalImage2DShape(padded_c_shape,
OpenCLBufferType::IN_OUT_HEIGHT,
&c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pad.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus PadKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
for (int i = 2; i <= 5; ++i) {
MACE_CHECK(paddings_[i] >= 0);
}
auto input_shape = input->shape();
if (type_ == PadType::REFLECT) {
MACE_CHECK(paddings_[2] < input_shape[1] &&
paddings_[3] < input_shape[1] &&
paddings_[4] < input_shape[2] &&
paddings_[5] < input_shape[2]);
} else if (type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[2] <= input_shape[1] &&
paddings_[3] <= input_shape[1] &&
paddings_[4] <= input_shape[2] &&
paddings_[5] <= input_shape[2]);
} else {
MACE_CHECK(type_ == PadType::CONSTANT);
}
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(MakeString("-DPAD_TYPE=", type_));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
if (type_ == PadType::CONSTANT) {
kernel_.setArg(idx++, this->constant_value_);
}
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/pad.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class PadKernel : public OpenCLPadKernel {
public:
PadKernel(const PadType type,
......@@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PadKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
for (int i = 2; i <= 5; ++i) {
MACE_CHECK(paddings_[i] >= 0);
}
auto input_shape = input->shape();
if (type_ == PadType::REFLECT) {
MACE_CHECK(paddings_[2] < input_shape[1] &&
paddings_[3] < input_shape[1] &&
paddings_[4] < input_shape[2] &&
paddings_[5] < input_shape[2]);
} else if (type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[2] <= input_shape[1] &&
paddings_[3] <= input_shape[1] &&
paddings_[4] <= input_shape[2] &&
paddings_[5] <= input_shape[2]);
} else {
MACE_CHECK(type_ == PadType::CONSTANT);
}
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(MakeString("-DPAD_TYPE=", type_));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
if (type_ == PadType::CONSTANT) {
kernel_.setArg(idx++, this->constant_value_);
}
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
auto data_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
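    // The computed paddings are totals; the kernel expects the top/left
    // halves.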
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -57,7 +57,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace pooling
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
MaceStatus Compute(
......@@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/reduce.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ReduceKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
built_options.emplace("-Dreduce=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
    // Ensure each work-item handles at least 4 input elements.
gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
if (gws[1] == 0) {
gws[1] = 1;
} else if (gws[1] > 16) {
gws[1] = 16;
}
}
lws = {gws[0], gws[1], 1};
const int group_num = lws[0] * lws[1] * lws[2];
  // Each work-item computes compute_size elements.
const int compute_size = (image_size + group_num - 1) / group_num;
const int last_index = image_size % group_num;
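  // Precomputed 1 / (H * W); applied by the kernel when reducing to a mean.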
const float scale = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
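    // __local scratch: one float4 partial result per work-item in the group.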
kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_num));
kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
kernel_.setArg(idx++, static_cast<int32_t>(last_index));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, scale);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -24,20 +24,18 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/ops/reduce.h"
#include "mace/ops/common/reduce_type.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ReduceKernel : public OpenCLReduceKernel {
public:
ReduceKernel(ReduceType type,
const std::vector<int> &axis,
const bool keep_dims)
: reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {}
const std::vector<int> &axis)
: reduce_type_(type), axis_(axis) {}
MaceStatus Compute(
OpContext *context,
......@@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel {
private:
ReduceType reduce_type_;
const std::vector<int> axis_;
bool keep_dims_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ReduceKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
built_options.emplace("-Dreduce=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
// Ensure each kernel has at least 4 input elements.
gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
if (gws[1] == 0) {
gws[1] = 1;
} else if (gws[1] > 16) {
gws[1] = 16;
}
}
lws = {gws[0], gws[1], 1};
const int group_num = lws[0] * lws[1] * lws[2];
// Each kernel intends to compute compute_size elements.
const int compute_size = (image_size + group_num - 1) / group_num;
const int last_index = image_size % group_num;
const float scale = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_num));
kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
kernel_.setArg(idx++, static_cast<int32_t>(last_index));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, scale);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bicubic.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ResizeBicubicKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(
MakeString("-DTABLE_SIZE=", common::utils::kTableSize));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bicubic",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
common::utils::CalculateResizeScale(
in_height, out_height, align_corners_);
float width_scale =
common::utils::CalculateResizeScale(
in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -25,13 +25,14 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bicubic.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
namespace resize_bicubic {
constexpr int64_t kTableSize = (1u << 10);
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
......@@ -60,7 +61,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bicubic
template <typename T>
class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
public:
ResizeBicubicKernel(bool align_corners,
......@@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBicubicKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(
MakeString("-DTABLE_SIZE=",
mace::ops::resize_bicubic::kTableSize));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bicubic",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
mace::ops::resize_bicubic::CalculateResizeScale(
in_height, out_height, align_corners_);
float width_scale =
mace::ops::resize_bicubic::CalculateResizeScale(
in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bilinear.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ResizeBilinearKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
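// CalculateResizeScale follows the usual convention: with align_corners
// the scale is (in - 1) / (out - 1) so corner pixels map exactly,
// otherwise it is in / out.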
float height_scale =
common::utils::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
common::utils::CalculateResizeScale(in_width,
out_width,
align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -25,7 +25,6 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bilinear.h"
namespace mace {
namespace ops {
......@@ -65,12 +64,11 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bilinear
template <typename T>
class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
public:
ResizeBilinearKernel(bool align_corners,
const index_t out_height,
const index_t out_width)
const index_t out_height,
const index_t out_width)
: align_corners_(align_corners),
out_height_(out_height),
out_width_(out_width) {}
......@@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBilinearKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
mace::ops::resize_bilinear::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
mace::ops::resize_bilinear::CalculateResizeScale(in_width,
out_width,
align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ResizeNearestNeighborKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *size,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
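// The output size arrives as a runtime tensor holding {out_height,
// out_width}; map it to host memory before reading.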
Tensor::MappingGuard size_mapper(size);
const index_t out_height = size->data<int32_t>()[0];
const index_t out_width = size->data<int32_t>()[1];
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
"resize_nearest_neighbor_nocache");
built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_nearest_neighbor",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
common::utils::CalculateResizeScale(
in_height, out_height, align_corners_);
float width_scale =
common::utils::CalculateResizeScale(
in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -25,7 +25,6 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_nearest_neighbor.h"
namespace mace {
namespace ops {
......@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_nearest_neighbor
template <typename T>
class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
public:
explicit ResizeNearestNeighborKernel(bool align_corners)
......@@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeNearestNeighborKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *size,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
Tensor::MappingGuard size_mapper(size);
const index_t out_height = size->data<int32_t>()[0];
const index_t out_width = size->data<int32_t>()[1];
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
"resize_nearest_neighbor_nocache");
built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_nearest_neighbor",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
mace::ops::resize_nearest_neighbor::CalculateResizeScale(
in_height, out_height, align_corners_);
float width_scale =
mace::ops::resize_nearest_neighbor::CalculateResizeScale(
in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
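// 2-D logits are treated as NHWC with height = width = 1.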
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
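// remain_channels counts the zero-padded lanes of the last channel block,
// so the kernel can exclude them from the softmax normalization.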
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (use_log_)
built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -56,7 +56,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
}
} // namespace softmax
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
explicit SoftmaxKernel(bool use_log)
......@@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (use_log_)
built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_batch.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus SpaceToBatchKernel::Compute(
OpContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
batch_tensor->ResizeImage(output_shape, output_image_shape));
const char *kernel_name = "space_to_batch";
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto input_dt = space_tensor->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, space_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
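// paddings is laid out {top, bottom, left, right}; the kernel only needs
// the top (paddings[0]) and left (paddings[2]) offsets.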
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
public:
MaceStatus Compute(
......@@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToBatchKernel<T>::Compute(
OpContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
batch_tensor->ResizeImage(output_shape, output_image_shape));
const char *kernel_name = "space_to_batch";
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, space_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_depth.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus SpaceToDepthKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
"input channel should be dividable by 4");
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
if (input_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", input_depth));
}
built_options.emplace(kernel_name_ss.str());
auto input_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
static_cast<uint32_t>(output_width),
static_cast<uint32_t>(output_height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
public:
explicit SpaceToDepthKernel(const int block_size)
......@@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToDepthKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
"input channel should be dividable by 4");
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
if (input_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", input_depth));
}
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
static_cast<uint32_t>(output_width),
static_cast<uint32_t>(output_height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/split.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus SplitKernel::Compute(
OpContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list) {
MACE_UNUSED(axis_);
const index_t input_channels = input->dim(3);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
std::vector<index_t> output_shape(
{input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
for (size_t i = 0; i < outputs_count; ++i) {
MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name);
auto input_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) {
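// One enqueue per output: launch i copies the channel blocks starting at
// offset i * channel_blk into output i.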
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr && runtime->is_profiling_enabled()) {
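// Accumulate profiling across the launches: keep the earliest start and
// sum the per-launch durations into end_micros (converted back into an
// end timestamp in the future's wait_fn below).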
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (context->future() != nullptr) {
context->future()->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class SplitKernel : public OpenCLSplitKernel {
public:
explicit SplitKernel(const int32_t axis) : axis_(axis) {}
......@@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel {
uint32_t kwg_size_;
};
template <typename T>
MaceStatus SplitKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list) {
const index_t input_channels = input->dim(3);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
std::vector<index_t> output_shape(
{input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
for (size_t i = 0; i < outputs_count; ++i) {
MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (context->future() != nullptr) {
context->future()->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/sqrdiff_mean.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus SqrDiffMeanKernel::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
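// Computes mean((input0 - input1)^2) over the spatial dims, yielding a
// {batch, 1, 1, channels} output.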
MACE_CHECK_NOTNULL(input0);
MACE_CHECK_NOTNULL(input1);
MACE_CHECK(input0->dim(0) == input1->dim(0) &&
input0->dim(3) == input1->dim(3));
MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
"SqrDiffMean gpu only support 4-dim input");
index_t batch = input0->dim(0);
const index_t in_height = input0->dim(1);
const index_t in_width = input0->dim(2);
const index_t channels = input0->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
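// Two-stage reduction: each work-item accumulates partial_len squared
// differences, then the group combines the partial sums in local memory.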
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float img_size_reciprocal = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
public:
MaceStatus Compute(
......@@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SqrDiffMeanKernel<T>::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
MACE_CHECK_NOTNULL(input0);
MACE_CHECK_NOTNULL(input1);
MACE_CHECK(input0->dim(0) == input1->dim(0) &&
input0->dim(3) == input1->dim(3));
MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
"SqrDiffMean gpu only support 4-dim input");
index_t batch = input0->dim(0);
const index_t in_height = input0->dim(1);
const index_t in_width = input0->dim(2);
const index_t channels = input0->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float img_size_reciprocal = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
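The kernel above leans on two small rounding helpers. A minimal sketch of their intended behavior (assuming MACE's usual utility definitions; simplified, non-template signatures, illustrative only):

#include <cstdint>

// Channels are packed four per RGBA texel, so image widths are counted
// in 4-channel blocks.
inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) >> 2; }

// Round n up to the nearest multiple of m (m > 0), so the padded global
// work size splits into full workgroups on devices without non-uniform
// workgroup support -- the fallback enqueue path above.
inline uint32_t RoundUp(uint32_t n, uint32_t m) {
  return ((n + m - 1) / m) * m;
}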
......@@ -29,7 +29,6 @@ namespace {
MaceStatus WinogradInputTransform(OpContext *context,
cl::Kernel *kernel,
const Tensor *input_tensor,
const DataType dt,
const int *paddings,
const index_t round_h,
const index_t round_w,
......@@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context,
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MaceStatus::MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
......@@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context,
kernel->setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
}
const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
......@@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context,
cl::Kernel *kernel,
const Tensor *input_tensor,
const Tensor *bias,
const DataType dt,
const index_t round_h,
const index_t round_w,
const int wino_blk_size,
......@@ -145,32 +142,40 @@ MaceStatus WinogradOutputTransform(OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
}
case PRELU: {
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
......@@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
OpenCLBufferType::IN_OUT_HEIGHT,
&t_input_image_shape);
ScratchImage transformed_input_image(scratch_manager);
std::unique_ptr<Tensor> transformed_input = make_unique<Tensor>(
transformed_input_image.Scratch(context->device()->allocator(),
t_input_image_shape, dt), dt);
auto input_dt = input->dtype();
auto image = transformed_input_image.Scratch(context->device()->allocator(),
t_input_image_shape, input_dt);
auto transformed_input = make_unique<Tensor>(image, input_dt);
MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape,
t_input_image_shape));
MACE_RETURN_IF_ERROR(WinogradInputTransform(
context, kernels[0], input, dt, paddings,
context, kernels[0], input, paddings,
round_h, round_w, wino_blk_size,
input_changed, transformed_input.get(),
kwg_size[0], &t_input_future));
......@@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
&mm_output_image_shape);
ScratchImage mm_output_image(scratch_manager);
auto output_dt = input->dtype();
std::unique_ptr<Tensor> mm_output = make_unique<Tensor>(
mm_output_image.Scratch(context->device()->allocator(),
mm_output_image_shape, dt), dt);
mm_output_image_shape, output_dt), output_dt);
MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape,
mm_output_image_shape));
......@@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, kernels[1]));
......@@ -334,7 +340,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const std::vector<uint32_t> lws = {*kwg_size[1] / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", mm_output_shape[0],
mm_output_shape[1], mm_output_shape[2]);
mm_output_shape[1], mm_output_shape[2]);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernels[1], tuning_key,
gws, lws, &mm_future));
......@@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
// t_output (blk_sqr, out_chan, out_width) -> output(NHWC)
MACE_RETURN_IF_ERROR(WinogradOutputTransform(
context, kernels[2], mm_output.get(), bias,
dt, round_h, round_w, wino_blk_size, activation, relux_max_limit,
round_h, round_w, wino_blk_size, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, kwg_size[2],
&t_output_future))
......
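For context, the -DUSE_* options emitted by the activation switch above are consumed inside the .cl sources at kernel build time, roughly like this (an OpenCL C sketch, not the verbatim MACE kernel; only three of the seven cases shown):

inline float4 do_activation(float4 v,
                            float relux_max_limit,
                            float leakyrelu_coefficient) {
#ifdef USE_RELU
  v = fmax(v, (float4)0);
#endif
#ifdef USE_RELUX
  v = clamp(v, (float4)0, (float4)relux_max_limit);
#endif
#ifdef USE_LEAKYRELU
  // keep v where v >= 0, otherwise scale by the leaky coefficient
  v = select(v * leakyrelu_coefficient, v, v >= (float4)0);
#endif
  return v;
}

Because the option is baked in when the program is built, the unused activation paths cost nothing in the compiled kernel binary.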
......@@ -25,21 +25,20 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class LSTMCellOp;
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class LSTMCellOp<DeviceType::GPU, T> : public Operation {
template<>
class LSTMCellOp<DeviceType::GPU, float> : public Operation {
public:
explicit LSTMCellOp(OpConstructContext *context)
: Operation(context) {
T forget_bias = static_cast<T>(
Operation::GetOptionalArg<float>("scalar_input",
0.0));
float forget_bias = Operation::GetOptionalArg<float>("scalar_input",
0.0);
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias);
kernel_ = make_unique<opencl::image::LSTMCellKernel>(forget_bias);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -47,30 +46,26 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
const Tensor *pre_output = context->workspace()->GetTensor(
operator_def_->input(1));
if (pre_output->is_weight()) {
MACE_CHECK(TransformFilter<T>(context,
operator_def_.get(),
1,
OpenCLBufferType::IN_OUT_CHANNEL,
mem_type) == MaceStatus::MACE_SUCCESS);
auto status = TransformFilter(context, operator_def_.get(),
1, OpenCLBufferType::IN_OUT_CHANNEL,
mem_type);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
}
MACE_CHECK(TransformFilter<T>(context,
operator_def_.get(),
2,
OpenCLBufferType::IN_OUT_CHANNEL,
mem_type) == MaceStatus::MACE_SUCCESS);
MACE_CHECK(TransformFilter<T>(context,
operator_def_.get(),
3,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
const Tensor *pre_cell = context->workspace()->GetTensor(
operator_def_->input(4));
auto status = TransformFilter(context, operator_def_.get(),
2, OpenCLBufferType::IN_OUT_CHANNEL,
mem_type);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = TransformFilter(context, operator_def_.get(),
3, OpenCLBufferType::ARGUMENT,
mem_type);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
const Tensor *pre_cell =
context->workspace()->GetTensor(operator_def_->input(4));
if (pre_cell->is_weight()) {
MACE_CHECK(TransformFilter<T>(context,
operator_def_.get(),
4,
OpenCLBufferType::IN_OUT_CHANNEL,
mem_type) == MaceStatus::MACE_SUCCESS);
status = TransformFilter(context, operator_def_.get(),
4, OpenCLBufferType::IN_OUT_CHANNEL,
mem_type);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
}
}
......@@ -92,14 +87,10 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL);
MACE_OP_OUTPUT_TAGS(CELL, OUTPUT);
};
#endif
#endif // MACE_ENABLE_OPENCL
void RegisterLSTMCell(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp,
DeviceType::GPU, half);
MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp);
}
} // namespace ops
......
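Throughout this diff the per-type GPU registrations collapse into MACE_REGISTER_GPU_OP. A plausible expansion, matching the registration lines it replaces (an assumption about the macro body, not a quote of it):

#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
  MACE_REGISTER_OP(op_registry, op_type, class_name,           \
                   DeviceType::GPU, float)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif  // MACE_ENABLE_OPENCL

The half registrations disappear on purpose: the GPU op classes in this diff become explicit float specializations, so each op is compiled once instead of once per data type.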
......@@ -17,7 +17,7 @@
#include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/common/pooling_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
......
......@@ -16,7 +16,7 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/pad.h"
#include "mace/ops/common/pad_type.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pad.h"
#endif // MACE_ENABLE_OPENCL
......@@ -26,10 +26,10 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class PadOp;
template <typename T>
template<typename T>
class PadOp<DeviceType::CPU, T> : public Operation {
public:
explicit PadOp(OpConstructContext *context)
......@@ -88,12 +88,12 @@ class PadOp<DeviceType::CPU, T> : public Operation {
for (index_t c = 0; c < channel; ++c) {
for (index_t h = 0; h < height; ++h) {
const index_t in_offset = (((b * channel + c) * height) +
h) * width;
h) * width;
const index_t out_offset =
(((b + this->paddings_[0]) * output->dim(1)
+ (c + this->paddings_[2])) * output->dim(2)
+ (h + this->paddings_[4])) * output->dim(3)
+ this->paddings_[6];
(((b + this->paddings_[0]) * output->dim(1)
+ (c + this->paddings_[2])) * output->dim(2)
+ (h + this->paddings_[4])) * output->dim(3)
+ this->paddings_[6];
memcpy(output_ptr + out_offset,
input_ptr + in_offset,
width * sizeof(T));
......@@ -101,11 +101,11 @@ class PadOp<DeviceType::CPU, T> : public Operation {
}
}
} else if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) {
const index_t o_batch = output->dim(0);
const index_t o_batch = output->dim(0);
const index_t o_channel = output->dim(1);
const index_t o_height = output->dim(2);
const index_t o_width = output->dim(3);
const int l_add = type_ == PadType::REFLECT ? 0 : -1;
const index_t o_height = output->dim(2);
const index_t o_width = output->dim(3);
const int l_add = type_ == PadType::REFLECT ? 0 : -1;
const int r_add = type_ == PadType::REFLECT ? -2 : -1;
for (index_t h = 0; h < o_height; ++h) {
......@@ -116,10 +116,10 @@ class PadOp<DeviceType::CPU, T> : public Operation {
for (index_t c = 0; c < o_channel; ++c) {
index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add);
const index_t in_offset = (((b_in * channel + c_in) * height) +
h_in) * width;
index_t out_offset = (((b * o_channel + c) * o_height) +
h) * o_width;
const index_t in_offset =
(((b_in * channel + c_in) * height) + h_in) * width;
index_t out_offset =
(((b * o_channel + c) * o_height) + h) * o_width;
for (index_t i = 0, j = paddings_[6] + l_add;
i < paddings_[6]; ++i, --j) {
......@@ -169,8 +169,8 @@ class PadOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class PadOp<DeviceType::GPU, T> : public Operation {
template<>
class PadOp<DeviceType::GPU, float> : public Operation {
public:
explicit PadOp(OpConstructContext *context)
: Operation(context) {
......@@ -180,7 +180,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
float constant_value = Operation::GetOptionalArg<float>(
"constant_value", 0.0);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PadKernel<T>>(
kernel_ = make_unique<opencl::image::PadKernel>(
type, paddings, constant_value);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -198,18 +198,11 @@ class PadOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
}
} // namespace ops
......
......@@ -16,8 +16,6 @@
#include <arm_neon.h>
#endif
#include "mace/ops/pooling.h"
#include <algorithm>
#include <limits>
#include <memory>
......@@ -28,6 +26,7 @@
#include "mace/core/tensor.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pooling.h"
#include "mace/ops/opencl/buffer/pooling.h"
......@@ -486,15 +485,15 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
template<>
class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
public:
explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
kernel_ = make_unique<opencl::image::PoolingKernel>();
} else {
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
kernel_ = make_unique<opencl::buffer::PoolingKernel>();
}
}
MaceStatus Run(OpContext *context) override {
......@@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Pooling", PoolingOp);
}
} // namespace ops
......
......@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/reduce.h"
#include <algorithm>
#include <memory>
#include <set>
#include <vector>
#include "mace/ops/common/reduce_type.h"
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
......@@ -868,15 +867,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
template<>
class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
public:
explicit ReduceOp(OpConstructContext *context)
: ReduceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_,
axis_,
keep_dims_);
kernel_ = make_unique<opencl::image::ReduceKernel>(reduce_type_,
axis_);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Reduce", ReduceOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Reduce")
......@@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false);
if (!keep_dims) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -12,14 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/resize_bicubic.h"
#include <algorithm>
#include <cmath>
#include <memory>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bicubic.h"
#endif // MACE_ENABLE_OPENCL
......@@ -33,12 +32,12 @@ inline const std::shared_ptr<float> InitCoeffsTable() {
// convolution algorithm.
// https://en.wikipedia.org/wiki/Bicubic_interpolation
auto coeffs_tab = std::shared_ptr<float>(
new float[(resize_bicubic::kTableSize + 1) * 2],
new float[(common::utils::kTableSize + 1) * 2],
std::default_delete<float[]>());
float *coeffs_tab_ptr = coeffs_tab.get();
static const float A = -0.75f;
for (int i = 0; i <= resize_bicubic::kTableSize; ++i) {
float x = i * 1.0f / resize_bicubic::kTableSize;
for (int i = 0; i <= common::utils::kTableSize; ++i) {
float x = i * 1.0f / common::utils::kTableSize;
coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
x += 1.0;
coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
......@@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit,
std::vector<int64_t> *indices) {
auto in_loc = static_cast<int64_t>(scale * out_loc);
const float delta = scale * out_loc - in_loc;
const int64_t offset = lrintf(delta * resize_bicubic::kTableSize);
const int64_t offset = lrintf(delta * common::utils::kTableSize);
const float *coeffs_tab = GetCoeffsTable();
*weights = {coeffs_tab[offset * 2 + 1],
coeffs_tab[offset * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]};
coeffs_tab[(common::utils::kTableSize - offset) * 2],
coeffs_tab[(common::utils::kTableSize - offset) * 2 + 1]};
*indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit),
Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)};
}
......@@ -173,13 +172,13 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
}
float height_scale =
resize_bicubic::CalculateResizeScale(in_height,
out_height,
align_corners_);
common::utils::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
resize_bicubic::CalculateResizeScale(in_width,
out_width,
align_corners_);
common::utils::CalculateResizeScale(in_width,
out_width,
align_corners_);
ResizeImage(context,
input_data,
......@@ -202,8 +201,8 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
template<>
class ResizeBicubicOp<DeviceType::GPU, float> : public Operation {
public:
explicit ResizeBicubicOp(OpConstructContext *context)
: Operation(context) {
......@@ -213,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>(
kernel_ = make_unique<opencl::image::ResizeBicubicKernel>(
align_corners, size[0], size[1]);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ResizeBicubic", ResizeBicubicOp);
}
} // namespace ops
......
......@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/resize_bilinear.h"
#include <algorithm>
#include <memory>
#include <vector>
......@@ -21,6 +19,7 @@
#include "mace/core/operator.h"
#include "mace/utils/memory.h"
#include "mace/core/quantize.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bilinear.h"
#endif // MACE_ENABLE_OPENCL
......@@ -223,13 +222,13 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation {
}
float height_scale =
resize_bilinear::CalculateResizeScale(in_height,
out_height,
align_corners_);
common::utils::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
resize_bilinear::CalculateResizeScale(in_width,
out_width,
align_corners_);
common::utils::CalculateResizeScale(in_width,
out_width,
align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
......@@ -299,13 +298,13 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
}
float height_scale =
resize_bilinear::CalculateResizeScale(in_height,
out_height,
align_corners_);
common::utils::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
resize_bilinear::CalculateResizeScale(in_width,
out_width,
align_corners_);
common::utils::CalculateResizeScale(in_width,
out_width,
align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
......@@ -336,8 +335,8 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
template<>
class ResizeBilinearOp<DeviceType::GPU, float> : public Operation {
public:
explicit ResizeBilinearOp(OpConstructContext *context)
: Operation(context) {
......@@ -347,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>(
kernel_ = make_unique<opencl::image::ResizeBilinearKernel>(
align_corners, size[0], size[1]);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ResizeBilinear", ResizeBilinearOp);
}
} // namespace ops
......
......@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/resize_nearest_neighbor.h"
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#endif // MACE_ENABLE_OPENCL
......@@ -115,13 +114,13 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
}
float height_scale =
resize_nearest_neighbor::CalculateResizeScale(in_height,
out_height,
align_corners_);
common::utils::CalculateResizeScale(in_height,
out_height,
align_corners_);
float width_scale =
resize_nearest_neighbor::CalculateResizeScale(in_width,
out_width,
align_corners_);
common::utils::CalculateResizeScale(in_width,
out_width,
align_corners_);
ResizeImageNCHW(context,
input_data,
batch,
......@@ -142,15 +141,15 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation {
template<>
class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
public:
explicit ResizeNearestNeighborOp(OpConstructContext *context)
: Operation(context) {
bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>(
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel>(
align_corners);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp);
}
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_nearest_neighbor {
inline float CalculateResizeScale(index_t in_size,
index_t out_size,
bool align_corners) {
return (align_corners && out_size > 1)
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_nearest_neighbor
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
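A quick numeric check of the two branches of CalculateResizeScale above (inputs chosen arbitrarily):

#include <cassert>

void CheckResizeScale() {
  // align_corners maps the corner samples of input and output onto
  // each other: scale = (in - 1) / (out - 1).
  assert(CalculateResizeScale(5, 3, true) == 2.0f);
  // otherwise the scale is the plain size ratio: scale = in / out.
  assert(CalculateResizeScale(6, 3, false) == 2.0f);
}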
......@@ -35,10 +35,10 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class SoftmaxOp;
template <>
template<>
class SoftmaxOp<DeviceType::CPU, float> : public Operation {
public:
explicit SoftmaxOp(OpConstructContext *context)
......@@ -139,12 +139,12 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
sum = std::max(sum, std::numeric_limits<float>::min());
if (use_log_) {
for (index_t c = 0; c < class_count; ++c) {
output_ptr[c] /= sum;
output_ptr[c] /= sum;
output_ptr[c] = std::log(output_ptr[c]);
}
} else {
for (index_t c = 0; c < class_count; ++c) {
output_ptr[c] /= sum;
output_ptr[c] /= sum;
}
}
}
......@@ -407,17 +407,17 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class SoftmaxOp<DeviceType::GPU, T> : public Operation {
template<>
class SoftmaxOp<DeviceType::GPU, float> : public Operation {
public:
explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) {
bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false));
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
kernel_ = make_unique<opencl::image::SoftmaxKernel>(use_log);
} else {
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel>(use_log);
}
}
MaceStatus Run(OpContext *context) override {
......@@ -433,7 +433,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterSoftmax(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU, float);
......@@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Softmax", SoftmaxOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation {
}
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class SpaceToBatchNDOp;
template <>
template<>
class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
public:
explicit SpaceToBatchNDOp(OpConstructContext *context)
......@@ -302,13 +302,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
template<>
class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase {
public:
explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>();
kernel_ = make_unique<opencl::image::SpaceToBatchKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) {
SpaceToBatchNDOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp);
}
} // namespace ops
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class SpaceToDepthOp : public Operation {
public:
explicit SpaceToDepthOp(OpConstructContext *context)
......@@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
template<>
class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
public:
explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
kernel_ = make_unique<opencl::image::SpaceToDepthKernel>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
}
} // namespace ops
......
......@@ -100,14 +100,14 @@ class SplitOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class SplitOp<DeviceType::GPU, T> : public Operation {
template<>
class SplitOp<DeviceType::GPU, float> : public Operation {
public:
explicit SplitOp(OpConstructContext *context)
: Operation(context) {
int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis);
kernel_ = make_unique<opencl::image::SplitKernel>(axis);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class SqrDiffMeanOp : public Operation {
public:
explicit SqrDiffMeanOp(OpConstructContext *context)
......@@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
template<>
class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation {
public:
explicit SqrDiffMeanOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SqrDiffMeanKernel<T>>();
kernel_ = make_unique<opencl::image::SqrDiffMeanKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -101,18 +100,11 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterSqrDiffMean(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp);
}
} // namespace ops
......
......@@ -20,18 +20,21 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class SqueezeOp : public Operation {
class SqueezeOpRaw : public Operation {
public:
explicit SqueezeOp(OpConstructContext *context)
explicit SqueezeOpRaw(OpConstructContext *context,
DeviceType device_type,
DataType data_type)
: Operation(context),
axis_(Operation::GetRepeatedArgs<int>("axis", {})),
checked_(false) {}
checked_(false),
data_type_(data_type),
device_type_(device_type) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
if (!checked_ && D == DeviceType::CPU
&& DataTypeToEnum<T>::value != DT_UINT8) {
if (!checked_ && device_type_ == DeviceType::CPU
&& data_type_ != DT_UINT8) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) {
......@@ -62,6 +65,16 @@ class SqueezeOp : public Operation {
private:
std::vector<int> axis_;
bool checked_;
DataType data_type_;
DeviceType device_type_;
};
template<DeviceType D, typename T>
class SqueezeOp : public SqueezeOpRaw {
public:
explicit SqueezeOp(OpConstructContext *context)
: SqueezeOpRaw(context, D, DataTypeToEnum<T>::value) {
}
};
void RegisterSqueeze(OpRegistryBase *op_registry) {
......@@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Squeeze", SqueezeOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Squeeze")
......@@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -37,55 +37,73 @@ def encrypt_code(code_str):
return encrypted_arr
def create_output_dir(dir_path):
if os.path.exists(dir_path):
if os.path.isdir(dir_path):
try:
shutil.rmtree(dir_path)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % dir_path)
else:
raise RuntimeError(
    "Cannot delete non-directory %s, inspect "
    "and remove manually" % dir_path)
os.makedirs(dir_path)
def write_cl_encrypted_kernel_to_file(
encrypted_code_maps, template_path, output_path):
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cl_encrypted_kernel = env.get_template(template_path).render(
tag='codegen',
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
with open(output_path, "w") as w_file:
w_file.write(cl_encrypted_kernel)
def get_module_key(file_name):
module_key = None
if file_name[-3:] == ".cl":
module_key = file_name[:-3]
elif file_name[-2:] == ".h":
module_key = file_name
return module_key
def encrypt_opencl_codegen(cl_kernel_dir, output_path):
if not os.path.exists(cl_kernel_dir):
print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!")
header_code = ""
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-2:] == ".h":
with open(file_path, "r") as f:
header_code += f.read()
encrypted_code_maps = {}
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-3:] == ".cl":
module_key = get_module_key(file_name)
if len(module_key) > 0:
with open(file_path, "r") as f:
code_str = ""
headers = []
for line in f.readlines():
if "#include <common.h>" in line:
code_str += header_code
headers.append(get_module_key("common.h"))
else:
code_str += line
encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cpp_cl_encrypted_kernel = env.get_template(
'str2vec_maps.cc.jinja2').render(
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
output_dir = os.path.dirname(output_path)
if os.path.exists(output_dir):
if os.path.isdir(output_dir):
try:
shutil.rmtree(output_dir)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % output_dir)
else:
raise RuntimeError(
"Cannot delete non-directory %s, inspect ",
"and remove manually" % output_dir)
os.makedirs(output_dir)
with open(output_path, "w") as w_file:
w_file.write(cpp_cl_encrypted_kernel)
encrypted_code = {}
encrypted_code['headers'] = headers
encrypted_code['code'] = encrypted_code_arr
encrypted_code_maps[module_key] = encrypted_code
create_output_dir(os.path.dirname(output_path))
write_cl_encrypted_kernel_to_file(
encrypted_code_maps, 'str2vec_maps.cc.jinja2', output_path)
output_path_h = output_path.replace('.cc', '.h')
write_cl_encrypted_kernel_to_file(
encrypted_code_maps, 'str2vec_maps.h.jinja2', output_path_h)
print('Generate OpenCL kernel done.')
......
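The module-key convention above decides what ends up in the generated map: .cl files are keyed by their stem, headers keep their full name, and anything else is skipped. Illustrative checks (assuming the helper above is importable):

assert get_module_key("conv_2d.cl") == "conv_2d"
assert get_module_key("common.h") == "common.h"
assert get_module_key("README.md") is None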
......@@ -14,24 +14,32 @@
// This is a generated file. DO NOT EDIT!
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include <map>
#include <string>
#include <vector>
namespace mace {
namespace {{tag}} {
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
{
{% for key, value in maps.items() %}
const std::map<std::string, ClProgramInfo> {{variable_name}} = {
{% for key, encrypted_code in maps.items() %}
{
"{{key}}",
{
{%- for ele in value -%}
{{ele}},
{%- endfor -%}
"{{key}}", {
{
{%- for header in encrypted_code['headers'] -%}
"{{header}}",
{%- endfor -%}
},
{
{%- for ele in encrypted_code['code'] -%}
{{ele}},
{%- endfor -%}
}
}
}, // {{key}}
{% endfor %}
};
} // {{tag}}
} // namespace mace
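Rendered, each map entry now pairs a kernel's header dependencies with its obfuscated source bytes; a hand-written sketch of a single entry (key and byte values invented for illustration):

const std::map<std::string, ClProgramInfo> kEncryptedProgramMap = {
  {
    "conv_2d", {
      {
        "common.h",          // header modules this kernel references
      },
      {
        0x4b, 0x1e, 0x2a,    // encrypt_code() output for conv_2d.cl
      }
    }
  },  // conv_2d
};

Recording headers as references rather than inlining them is part of the size reduction: common.h is no longer duplicated into every kernel string that includes it.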
......@@ -12,23 +12,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_BILINEAR_H_
#define MACE_OPS_RESIZE_BILINEAR_H_
// This is a generated file. DO NOT EDIT!
#include "mace/core/types.h"
#include <map>
#include <string>
#include <vector>
namespace mace {
namespace ops {
namespace resize_bilinear {
inline float CalculateResizeScale(index_t in_size,
index_t out_size,
bool align_corners) {
return (align_corners && out_size > 1)
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bilinear
} // namespace ops
} // namespace mace
namespace {{tag}} {
struct ClProgramInfo {
const std::vector<std::string> headers_;
const std::vector<{{data_type}}> encrypted_code_;
};
#endif // MACE_OPS_RESIZE_BILINEAR_H_
extern const std::map<std::string, ClProgramInfo> {{variable_name}};
} // {{tag}}
} // namespace mace
......@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
ret = repository_ctx.execute(
["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path],
["test", "-f", "%s/mace/ops/opencl/cl/common.cl" % mace_root_path],
)
if ret.return_code == 0:
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
......@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
python_bin_path,
"%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
"--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
"--output_path=%s/encrypt_opencl_kernel" % generated_files_path,
"--output_path=%s/encrypt_opencl_kernel.cc" % generated_files_path,
], quiet = False)
encrypt_opencl_kernel_repository = repository_rule(
......
......@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context,
net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
......
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace {
namespace ops {
......
......@@ -14,7 +14,7 @@
#include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/pooling.h"
#include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
......@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
......@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
......
......@@ -45,8 +45,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
"BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output);
......@@ -54,8 +54,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
......@@ -90,8 +90,8 @@ void TestArgumentTransform(const index_t input_size) {
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
......
......@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
DtToCLCMDDt(DataTypeToEnum<float>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<float>::value));
DtToCLDt(DataTypeToEnum<float>::value));
built_options.emplace(
"-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<float>::value));
DtToCLCMDDt(DataTypeToEnum<float>::value));
}
cl::Kernel kernel;
......
......@@ -16,8 +16,8 @@
#include <string>
#include <vector>
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace {
namespace ops {
......
......@@ -14,8 +14,8 @@
#include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include <vector>
#include "mace/ops/reduce.h"
#include "mace/ops/common/reduce_type.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......