Commit 0792637f authored by Liangliang He

Merge branch 'minify_opencl' into 'master'

Minify opencl

Drop the per-type template parameter from the OpenCL ops: GPU kernels become plain float specializations registered through a new MACE_REGISTER_GPU_OP macro, half-typed requests are mapped to the float registrations at lookup time, and the encrypted OpenCL kernel sources are generated as a .cc/.h pair whose programs carry their own header lists.

See merge request !1104
......@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA)
endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
......@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
}
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MyCustomOp<DeviceType::GPU, T> : public Operation {
template<>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
......@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}
} // namespace ops
......
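Putting the three hunks above together, a custom op under the new convention looks roughly like the sketch below. It is assembled from the fragments in this diff, with an assumed include path and placeholder Run bodies; the GPU variant is now a full float specialization and its registration goes through MACE_REGISTER_GPU_OP:

#include "mace/core/operator.h"  // assumed include path

namespace mace {
namespace ops {

template <DeviceType D, typename T>
class MyCustomOp;

template <>
class MyCustomOp<DeviceType::CPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context) : Operation(context) {}
  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    return MaceStatus::MACE_SUCCESS;  // placeholder body
  }
};

#ifdef MACE_ENABLE_OPENCL
// GPU ops are full specializations on float now; half-typed models
// reuse this kernel through the DT_HALF -> DT_FLOAT key mapping.
template <>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context) : Operation(context) {}
  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    return MaceStatus::MACE_SUCCESS;  // placeholder body
  }
};
#endif  // MACE_ENABLE_OPENCL

void RegisterMyCustomOp(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                   DeviceType::CPU, float);
  // Expands to nothing when MACE_ENABLE_OPENCL is undefined.
  MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}

}  // namespace ops
}  // namespace mace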
......@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"],
)
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library(
name = "generated_models",
......@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library(
name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [
"-Werror",
"-Wextra",
......
......@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
......
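This one-line change is what lets GPU kernels be registered for float only: a half-typed request now builds the same lookup key as a float one. A minimal sketch of the effect, assuming OpKeyBuilder simply folds the device and type constraints into the key string as its usage here suggests:

DataType lookup_dtype = (dtype == DT_HALF) ? DT_FLOAT : dtype;
std::string key = OpKeyBuilder(op_type)
                      .Device(device_type)
                      .TypeConstraint("T", lookup_dtype)
                      .Build();
// A "Conv2D" requested with DT_HALF on GPU resolves to the creator
// registered for DT_FLOAT, so each GPU op needs only one registration.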
......@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const {
return operator_def_;
......@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template <typename T>
template<typename T>
inline T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template <typename T>
template<typename T>
inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
......@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
......@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
class OpRegistryBase {
public:
OpRegistryBase() = default;
......@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context,
DeviceType device_type) const;
template <class DerivedType>
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
......@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
......
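For reference, here is what a registration written with the new macro expands to when MACE_ENABLE_OPENCL is defined, derived directly from the definition above (with OpenCL disabled the macro expands to nothing):

MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
// becomes:
op_registry->Register(
    "AddN",
    DeviceType::GPU,
    DT_FLOAT,
    OpRegistryBase::DefaultCreator<AddNOp<DeviceType::GPU, float>>);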
......@@ -18,20 +18,19 @@
#include <fstream>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/macros.h"
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h"
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
case CL_SUCCESS:
......@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner):
std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner),
......@@ -332,7 +331,7 @@ OpenCLRuntime::OpenCLRuntime(
cl_int err;
if (gpu_type_ == GPUType::QUALCOMM_ADRENO
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
std::vector<cl_context_properties> context_properties;
context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties,
......@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
};
context_ = std::shared_ptr<cl::Context>(
......@@ -399,7 +398,7 @@ OpenCLRuntime::OpenCLRuntime(
if (cached_binary_platform_info != platform_info_) {
if (precompiled_binary_storage_ == nullptr) {
VLOG(1) << "There is no precompiled OpenCL binary in"
" any of the OpenCL binary paths.";
" any of the OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
......@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true;
}
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Failed to find program " << program_name << ".";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header (" << header << ") not found.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from source
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source != kEncryptedProgramMap.end()) {
std::string kernel_source;
MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source);
*program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str());
......
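GetProgramSourceByName reads from the generated mace/codegen/opencl/encrypt_opencl_kernel.h. The generated declarations themselves are not shown in this diff, but from the accesses above (kEncryptedProgramMap, headers_, encrypted_code_) the header is shaped roughly like the following sketch; the struct name and field types are assumptions inferred from usage:

// Hypothetical shape of the generated encrypt_opencl_kernel.h
#include <map>
#include <string>
#include <vector>

namespace mace {
namespace codegen {

struct EncryptedProgram {                      // assumed name
  std::vector<std::string> headers_;           // header programs to prepend
  std::vector<unsigned char> encrypted_code_;  // obfuscated kernel source
};

extern const std::map<std::string, EncryptedProgram> kEncryptedProgramMap;

}  // namespace codegen
}  // namespace mace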
......@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS;
......
......@@ -118,9 +118,21 @@ def mace_version_genrule():
)
def encrypt_opencl_kernel_genrule():
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))],
outs = ["opencl/encrypt_opencl_kernel.cc"],
cmd = "cat $(SRCS) > $@;"
)
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = srcs,
outs = outs,
cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
)
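For the two src/out pairs above, the list comprehension evaluates to a single shell command (wrapped here for readability):

cat $(location @local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc) > $(location opencl/encrypt_opencl_kernel.cc) &&
cat $(location @local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h) > $(location opencl/encrypt_opencl_kernel.h)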
......@@ -181,7 +181,6 @@ cc_library(
],
)
cc_library(
name = "internal_ops",
srcs = glob(
......@@ -239,10 +238,10 @@ cc_library(
name = "ops",
srcs = [
"registry/ops_registry.cc",
],
],
hdrs = [
"registry/ops_registry.h",
],
],
copts = [
"-Werror",
"-Wextra",
......
......@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
template<>
class ActivationOp<DeviceType::GPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
auto leakyrelu_coefficient =
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
}
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
......@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -29,10 +29,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class AddNOp;
template <>
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
......@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class AddNOp<DeviceType::GPU, T> : public Operation {
template<>
class AddNOp<DeviceType::GPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
kernel_ = make_unique<opencl::image::AddNKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
......@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchNormOp<DeviceType::GPU, T> : public Operation {
template<>
class BatchNormOp<DeviceType::GPU, float> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context) {
......@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
} // namespace ops
......
......@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
}
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
}
};
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
template<>
class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
......@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
}
} // namespace ops
......
......@@ -34,16 +34,16 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BiasAddOp;
template <>
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
{}
has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BiasAddOp<DeviceType::GPU, T> : public Operation {
template<>
class BiasAddOp<DeviceType::GPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
......@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
......@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -23,10 +23,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class ChannelShuffleOp;
template <typename T>
template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
......@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
template<>
class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1);
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_PAD_H_
#define MACE_OPS_PAD_H_
#ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace {
namespace ops {
......@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops
} // namespace mace
#endif // MACE_OPS_PAD_H_
#endif // MACE_OPS_COMMON_PAD_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_POOLING_H_
#define MACE_OPS_POOLING_H_
#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace {
......@@ -23,4 +23,4 @@ enum PoolingType {
};
} // namespace mace
#endif // MACE_OPS_POOLING_H_
#endif // MACE_OPS_COMMON_POOLING_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REDUCE_H_
#define MACE_OPS_REDUCE_H_
#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace {
......@@ -28,4 +28,4 @@ enum ReduceType {
};
} // namespace mace
#endif // MACE_OPS_REDUCE_H_
#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
......@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_
#define MACE_OPS_RESIZE_BICUBIC_H_
#ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_bicubic {
namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size,
......@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_
#endif // MACE_OPS_COMMON_UTILS_H_
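A quick worked example of the relocated helper (assuming the elided condition is the usual align_corners check): resizing in_size = 10 to out_size = 5 yields a scale of 10 / 5 = 2.0 normally, but (10 - 1) / (5 - 1) = 2.25 with align_corners, since align_corners pins the first and last samples to the input corners.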
......@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_;
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class ConcatOp;
template <typename T>
template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
......@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
template<>
class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
kernel_ = make_unique<opencl::image::ConcatKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
......@@ -228,51 +227,44 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return { DeviceType::CPU };
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return {DeviceType::CPU};
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return {DeviceType::CPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
}
}
// Channel counts not divisible by 4 are only supported with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return {DeviceType::CPU};
}
}
// Channel counts not divisible by 4 are only supported with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU };
}
}
return { DeviceType::CPU, DeviceType::GPU };
}));
return {DeviceType::CPU, DeviceType::GPU};
}));
}
} // namespace ops
......
......@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
template<>
class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
......@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
}
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
......@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
}
} // namespace ops
......
......@@ -24,10 +24,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class CropOp;
template <class T>
template<class T>
class CropOp<DeviceType::CPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
......@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
......@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>();
const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data());
......@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
private:
void crop_copy(const T* input_data, T* output_data,
void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
const int32_t* offsets) {
const int32_t *offsets) {
const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3];
......@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr =
T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch =
const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
......@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class CropOp<DeviceType::GPU, T> : public Operation {
template<>
class CropOp<DeviceType::GPU, float> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
MACE_NOT_IMPLEMENTED;
......@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
......@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
3,
......@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class DepthToSpaceOp : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
......@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
template<>
class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
}
} // namespace ops
......
......@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
}
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
......@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
......@@ -467,8 +464,8 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
}));
}
......
......@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
......@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
} // namespace ops
......
......@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class EltwiseOp<DeviceType::GPU, T> : public Operation {
template<>
class EltwiseOp<DeviceType::GPU, float> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context) {
......@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
}
} // namespace ops
......
......@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
template<>
class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
}
} // namespace ops
......
......@@ -18,7 +18,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation {
public:
explicit IdentityOp(OpConstructContext *context)
......@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
};
void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -19,7 +19,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation {
public:
explicit InferConv2dShapeOp(OpConstructContext *context)
......@@ -66,20 +65,23 @@ class InferConv2dShapeOp : public Operation {
int32_t out_h = 0, out_w = 0;
if (!paddings.empty()) {
out_h = (in_h - kernels[2] + paddings[0]) / strides[0] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else {
switch (padding_type) {
case SAME:
case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1];
break;
case VALID:
}
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1];
break;
default:
}
default: {
MACE_NOT_IMPLEMENTED;
break;
}
}
}
......@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
};
void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -77,7 +77,7 @@ class MatMulOpBase : public Operation {
} else {
MACE_CHECK(lhs_rank == 2 || rhs_rank == 2,
"Either lhs or rhs matrix should has rank 2 "
"for non-batched matrix multiplication");
"for non-batched matrix multiplication");
}
index_t
......@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
template<>
class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
......@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
};
#endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
......@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t size = input->dim(0);
......@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS;
}
MaceStatus BufferTransform::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, output);
default:
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
class BufferTransform : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpContext *context,
......@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
......@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(kwg_size_);
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
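The padded-input geometry in Compute follows standard convolution arithmetic: the output width is rounded up to the tile width, then the required input width is derived back through strides and dilations. A standalone worked example of those two formulas:

#include <cstdint>
#include <iostream>

using index_t = int64_t;

template <typename T>
T RoundUp(T v, T m) { return (v + m - 1) / m * m; }

int main() {
  // Example: the 1x1 conv branch (tile_w = 2), stride 1, dilation 1.
  index_t out_w = 7, filter_w = 1, stride_w = 1, dilation_w = 1;
  index_t tile_w = 2;
  index_t padded_out_w = RoundUp<index_t>(out_w, tile_w);  // 8
  index_t padded_in_w =
      (padded_out_w - 1) * stride_w + (filter_w - 1) * dilation_w + 1;  // 8
  std::cout << padded_out_w << " " << padded_in_w << "\n";
  return 0;
}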
......@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
......@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
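Both the old template Compute and the new one select the kernel body through a std::function so the shared padding step stays common. A standalone sketch of that dispatch shape (stub types; the real lambdas capture the OpenCL kernels and futures):

#include <functional>
#include <iostream>

struct Tensor {};  // stub
enum class Status { OK };

Status Conv2d1x1(const Tensor *, Tensor *) {
  std::cout << "1x1 path\n";
  return Status::OK;
}
Status Conv2dGeneral(const Tensor *, Tensor *) {
  std::cout << "general path\n";
  return Status::OK;
}

int main() {
  long filter_h = 1, filter_w = 1;
  const bool use_1x1 = (filter_h == 1 && filter_w == 1);
  std::function<Status(const Tensor *, Tensor *)> conv_func =
      use_1x1 ? Conv2d1x1 : Conv2dGeneral;
  Tensor padded_input, output;
  conv_func(&padded_input, &output);  // padding already handled upstream
  return 0;
}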
......@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
} // namespace depthwise
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
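The "fake" conv2d filter above exists so depthwise convolution can reuse the shared padding and output-size helpers: with a depthwise filter laid out as {multiplier, in_channels, kh, kw}, the equivalent conv filter has multiplier * in_channels output channels. A standalone worked example:

#include <cstdint>
#include <iostream>
#include <vector>

using index_t = int64_t;

int main() {
  // Depthwise filter layout here: {multiplier, in_channels, kh, kw}.
  std::vector<index_t> filter = {1, 32, 3, 3};
  // Fake conv2d filter: output channels = multiplier * in_channels, so the
  // shared padding/output-size helpers can be reused unchanged.
  std::vector<index_t> fake_filter = {filter[0] * filter[1], filter[1],
                                      filter[2], filter[3]};
  std::cout << fake_filter[0] << "x" << fake_filter[1] << "x"
            << fake_filter[2] << "x" << fake_filter[3] << "\n";  // 32x32x3x3
  return 0;
}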
......@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
......@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
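The scratch sizing used by conv, depthwise conv, and pooling is the product of the padded dims times the element size, plus a safety pad. A standalone worked example (kExtraBufferPadSize is an illustrative stand-in for MACE_EXTRA_BUFFER_PAD_SIZE, whose actual value is not shown in this diff):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using index_t = int64_t;

constexpr index_t kExtraBufferPadSize = 64;  // illustrative stand-in value

int main() {
  std::vector<index_t> padded_input_shape = {1, 34, 34, 32};  // NHWC
  const index_t elem_size = 2;  // e.g. half storage
  index_t padded_input_size =
      std::accumulate(padded_input_shape.begin(), padded_input_shape.end(),
                      index_t(1), std::multiplies<index_t>()) *
          elem_size +
      kExtraBufferPadSize;
  std::cout << padded_input_size << "\n";  // 1*34*34*32*2 + 64 = 74048
  return 0;
}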
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
auto input_dtype = input->dtype();
auto input_dt = DtToCLDt(input_dtype);
built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
auto output_dtype = output->dtype();
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
if (pooling_type == MAX && input_dtype == output_dtype) {
built_options.emplace("-DDATA_TYPE=" + input_dt);
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
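Pooling is the one kernel here that keeps a non-float compute type: a MAX reduction only compares values, so when the input and output storage types match it can run entirely in that type, while AVG accumulates and therefore stays in float. A standalone sketch of that choice (simplified stand-in helper):

#include <iostream>
#include <string>

enum DataType { DT_FLOAT, DT_HALF };
enum PoolingType { AVG, MAX };

std::string DtToCLDt(DataType dt) {  // simplified stand-in
  return dt == DT_HALF ? "half" : "float";
}

std::string ComputeType(PoolingType pooling, DataType in, DataType out) {
  if (pooling == MAX && in == out) return DtToCLDt(in);  // compare-only path
  return DtToCLDt(DT_FLOAT);  // AVG accumulates, so stay in float
}

int main() {
  std::cout << ComputeType(MAX, DT_HALF, DT_HALF) << "\n";  // half
  std::cout << ComputeType(AVG, DT_HALF, DT_HALF) << "\n";  // float
  return 0;
}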
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
......@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
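The channel math above processes channels four at a time: channel_blocks counts the 4-wide vector blocks and remain_channels is how many lanes of the last block are padding that the kernel must mask out. A standalone worked example:

#include <cstdint>
#include <iostream>

using index_t = int64_t;

index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }

int main() {
  index_t channels = 10;
  index_t channel_blocks = RoundUpDiv4(channels);  // 3 blocks of 4 lanes
  int remain_channels = static_cast<int>(channel_blocks * 4 - channels);  // 2
  // The kernel masks out the 2 padding lanes in the last 4-wide block.
  std::cout << channel_blocks << " " << remain_channels << "\n";
  return 0;
}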
......@@ -29,7 +29,7 @@ namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
explicit SoftmaxKernel(bool use_log)
......@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -20,11 +20,11 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BufferTransformOp;
template <typename T>
class BufferTransformOp<DeviceType::GPU, T> : public Operation {
template<>
class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
......@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
......@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_;
};
void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
}
} // namespace ops
......
......@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix;
}
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size) {
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
const DataType dt = input->dtype();
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops
} // namespace mace
......@@ -28,17 +28,16 @@
namespace mace {
namespace ops {
// Only used for the GPU Operation (BufferTransform)
template<typename T>
class OpenCLBufferTransformer {
public:
OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>();
kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>();
kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>();
kernel_ = make_unique<opencl::buffer::BufferTransform>();
}
}
......@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) {
......@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
......@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size = 0) {
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
const int wino_blk_size = 0);
} // namespace ops
} // namespace mace
......
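The constructor above picks the transform kernel purely from the two memory types: the destination wins first, then the source, and plain buffer-to-buffer falls through to BufferTransform. A standalone sketch of that selection:

#include <iostream>
#include <string>

enum MemoryType { CPU_BUFFER, GPU_BUFFER, GPU_IMAGE };

// Mirrors the constructor: destination decides first, then source; plain
// buffer-to-buffer falls through to BufferTransform.
std::string PickKernel(MemoryType in, MemoryType out) {
  if (out == GPU_IMAGE) return "BufferToImage";
  if (in == GPU_IMAGE) return "ImageToBuffer";
  return "BufferTransform";
}

int main() {
  std::cout << PickKernel(GPU_BUFFER, GPU_IMAGE) << "\n";   // BufferToImage
  std::cout << PickKernel(GPU_IMAGE, GPU_BUFFER) << "\n";   // ImageToBuffer
  std::cout << PickKernel(GPU_BUFFER, GPU_BUFFER) << "\n";  // BufferTransform
  return 0;
}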
......@@ -17,8 +17,9 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
class OpContext;
......
......@@ -17,7 +17,10 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace {
......
......@@ -19,6 +19,9 @@
#include <vector>
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace {
......
......@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h"
......
......@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
}
}
std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
......
......@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select
// e.g. half -> float
std::string DtToCLCondDt(const DataType dt);
......
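The two DtToUpCompatible* helpers could be deleted because, for both types they accepted (float and half), they returned the float mapping, which call sites now spell directly as DtToCLDt(DT_FLOAT). A standalone demonstration of that equivalence (stand-in versions of both helpers):

#include <cassert>
#include <string>

enum DataType { DT_FLOAT, DT_HALF };

std::string DtToCLDt(DataType dt) {  // simplified stand-in
  return dt == DT_HALF ? "half" : "float";
}

std::string DtToUpCompatibleCLDt(DataType) {  // the removed helper, in essence
  return "float";  // half and float both map "up" to float
}

int main() {
  for (DataType dt : {DT_FLOAT, DT_HALF}) {
    assert(DtToUpCompatibleCLDt(dt) == DtToCLDt(DT_FLOAT));
  }
  return 0;
}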
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ActivationKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
switch (activation_) {
case RELU: {
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
}
case RELUX: {
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
}
case PRELU: {
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
}
case TANH: {
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
}
case SIGMOID: {
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
}
case LEAKYRELU: {
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation_;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
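The 3D global work size used here, and by most image kernels in this diff, flattens NHWC as {C/4 blocks, W, N*H}, one work item per 4-channel pixel. A standalone example of the mapping:

#include <cstdint>
#include <iostream>

int main() {
  int64_t batch = 2, height = 8, width = 8, channels = 10;
  int64_t channel_blocks = (channels + 3) / 4;  // RoundUpDiv4
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),   // 3
                           static_cast<uint32_t>(width),            // 8
                           static_cast<uint32_t>(height * batch)};  // 16
  std::cout << gws[0] << " " << gws[1] << " " << gws[2] << "\n";
  return 0;
}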
......@@ -31,12 +31,11 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit,
T leakyrelu_coefficient)
float relux_max_limit,
float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
......@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private:
ActivationType activation_;
T relux_max_limit_;
T leakyrelu_coefficient_;
float relux_max_limit_;
float leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus AddNKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
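AddN flattens the same image layout into a 2D range instead, folding the channel blocks into the width axis, and bakes the input count into the program via -DINPUT_NUM, which is why more than four inputs hit MACE_NOT_IMPLEMENTED. A worked example of the 2D range:

#include <cstdint>
#include <iostream>

int main() {
  int64_t batch = 1, height = 8, width = 8, channels = 16;
  int64_t channel_blocks = (channels + 3) / 4;    // 4
  int64_t width_pixels = channel_blocks * width;  // 32
  int64_t batch_height_pixels = batch * height;   // 8
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  std::cout << gws[0] << " " << gws[1] << "\n";  // 32 8
  return 0;
}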
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
......@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
MaceStatus BatchNormKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
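// When mean and var are provided, the kernel normalizes at run time;
// otherwise scale/offset are assumed pre-folded by the model converter
// (conceptually scale' = scale / sqrt(var + epsilon) and
// offset' = offset - mean * scale'), selected via -DFOLDED_CONSTANT below.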
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
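// IN_OUT_CHANNEL images pack 4 consecutive channels per pixel, so the
// work grid is (channel_blocks, width, height * batch).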
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
  case NOOP:
    break;
  case RELU:
    built_options.emplace("-DUSE_RELU");
    break;
  case RELUX:
    built_options.emplace("-DUSE_RELUX");
    break;
  case TANH:
    built_options.emplace("-DUSE_TANH");
    break;
  case SIGMOID:
    built_options.emplace("-DUSE_SIGMOID");
    break;
  case LEAKYRELU:
    built_options.emplace("-DUSE_LEAKYRELU");
    break;
  default:
    LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
......@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BatchToSpaceKernel::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = batch_tensor->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
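// Block sizes are passed first; the crops are presumably laid out as
// [top, bottom, left, right], so only paddings[0] (top) and paddings[2]
// (left) are needed by the kernel.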
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
......@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BiasAddKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
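// Devices without non-uniform work-group support require the global size
// to be a multiple of the local size, so gws is rounded up; the surplus
// work items are expected to be masked out in the kernel via the gws
// arguments set above.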
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
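// The enqueue is asynchronous; the future lets the caller block on the
// event later and, when profiling, collect per-call stats.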
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
......@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BufferToImage::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
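// Each OpenCLBufferType maps to a dedicated transform kernel. For
// WINOGRAD_FILTER, filters are stored as (blk + 2) x (blk + 2) transformed
// tiles, so the image height is divided by that tile area below.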
switch (type) {
  case CONV2D_FILTER:
    kernel_name = "filter_buffer_to_image";
    break;
  case DW_CONV2D_FILTER:
    kernel_name = "dw_filter_buffer_to_image";
    break;
  case IN_OUT_CHANNEL:
    kernel_name = "in_out_buffer_to_image";
    break;
  case ARGUMENT:
    kernel_name = "arg_buffer_to_image";
    break;
  case IN_OUT_HEIGHT:
    kernel_name = "in_out_height_buffer_to_image";
    break;
  case IN_OUT_WIDTH:
    kernel_name = "in_out_width_buffer_to_image";
    break;
  case WEIGHT_HEIGHT:
    kernel_name = "weight_height_buffer_to_image";
    break;
  case WEIGHT_WIDTH:
    kernel_name = "weight_width_buffer_to_image";
    break;
  case WINOGRAD_FILTER: {
    std::stringstream ss_tmp;
    gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
    ss_tmp << "winograd_filter_buffer_to_image_"
           << wino_blk_size << "x" << wino_blk_size;
    kernel_name = ss_tmp.str();
    break;
  }
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
auto input_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
......@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ChannelShuffleKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of groups. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
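// Channel shuffle transposes the (groups, channels_per_group) channel
// layout to (channels_per_group, groups), interleaving channels across
// groups.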
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
......@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
auto data_dt = input0->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
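// With the template parameter gone, the CL data type is now derived from
// the tensors themselves; when input and output dtypes differ, the kernel
// falls back to float compute.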
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
......@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
......@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
......@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
}
} // namespace concat
MaceStatus ConcatKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be the same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should be equal except along the axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
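// Two inputs use the specialized concat_channel kernel; more inputs fall
// back to the generic concat_channel_multi kernel.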
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
&input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context,
&kernel_,
input_list,
output,
&kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
ConcatKernel() {}
......@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, &kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
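// Worked example (illustrative numbers): for an output of 1 x 64 x 64 and
// block size 4, transformed_width = 1 * 16 * 16 = 256 and sqr_block =
// (4 + 2) * (4 + 2) = 36, so the check requires 256 < max image width and
// 36 * in/out channels < max image height.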
// The GPU path supports only 2x2 and 4x4 Winograd convolution.
if (*wino_blk_size == 4) {
// If block size 4 exceeds the OpenCL image size limit, fall back to 2.
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
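// Dispatch: Winograd when a block size was selected above, otherwise the
// specialized 1x1 / 3x3 kernels, and the generic conv kernel as fallback.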
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// Use Winograd convolution.
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
(48 more file diffs are collapsed.)