提交 0792637f 编写于 作者: Liangliang He

Merge branch 'minify_opencl' into 'master'

Minify opencl

See merge request !1104
......@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA)
endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
......@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
}
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MyCustomOp<DeviceType::GPU, T> : public Operation {
template<>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
......@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}
} // namespace ops
......
......@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"],
)
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library(
name = "generated_models",
......@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library(
name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [
"-Werror",
"-Wextra",
......
......@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
......
......@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const {
return operator_def_;
......@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template <typename T>
template<typename T>
inline T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template <typename T>
template<typename T>
inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
......@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
......@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
class OpRegistryBase {
public:
OpRegistryBase() = default;
......@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context,
DeviceType device_type) const;
template <class DerivedType>
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
......@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
// Registers a concrete (non-template-specialized) op class `class_name`
// for the given device and data type `dt`.
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
// Registers the GPU specialization of a templated op class. Only the
// float instantiation is registered; DT_HALF lookups are presumably
// redirected to the DT_FLOAT key by OpRegistryBase::CreateOperation
// (see the key-building change in this commit) -- TODO confirm.
// Expands to nothing when OpenCL support is disabled, so call sites need
// no #ifdef guard of their own.
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
// Registers a device-placement/data-format condition builder for an op.
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
......
......@@ -18,20 +18,19 @@
#include <fstream>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/macros.h"
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h"
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
case CL_SUCCESS:
......@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner):
std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner),
......@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
};
context_ = std::shared_ptr<cl::Context>(
......@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true;
}
// Assembles the OpenCL kernel source for `program_name` from the
// build-time generated mace::codegen::kEncryptedProgramMap: the program's
// header entries are emitted first, then the program body. Each stored
// byte sequence is passed through ObfuscateString before being appended
// (de-obfuscation of the encrypted kernel text -- NOTE(review): assumes
// ObfuscateString is its own inverse; verify against the encryption
// genrule).
//
// @param program_name  key into kEncryptedProgramMap.
// @param source        [out] receives the concatenated kernel source;
//                      must be non-null.
// @return MACE_SUCCESS on success, MACE_RUNTIME_ERROR when
//         `program_name` is not present in the map.
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Find program " << program_name << " failed.";
return MaceStatus::MACE_RUNTIME_ERROR;
}
// Emit headers before the program body so the body can use their
// definitions.
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
// A missing header is tolerated (warn and continue) rather than
// treated as a hard error.
LOG(WARNING) << "Program header(" << header << ") is empty.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from source
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source != kEncryptedProgramMap.end()) {
std::string kernel_source;
MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source);
*program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str());
......
......@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS;
......
......@@ -118,9 +118,21 @@ def mace_version_genrule():
)
def encrypt_opencl_kernel_genrule():
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))],
outs = ["opencl/encrypt_opencl_kernel.cc"],
cmd = "cat $(SRCS) > $@;"
srcs = srcs,
outs = outs,
cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
)
......@@ -181,7 +181,6 @@ cc_library(
],
)
cc_library(
name = "internal_ops",
srcs = glob(
......
......@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
template<>
class ActivationOp<DeviceType::GPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
auto leakyrelu_coefficient =
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
}
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
......@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -29,10 +29,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class AddNOp;
template <>
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
......@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class AddNOp<DeviceType::GPU, T> : public Operation {
template<>
class AddNOp<DeviceType::GPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
kernel_ = make_unique<opencl::image::AddNKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
......@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchNormOp<DeviceType::GPU, T> : public Operation {
template<>
class BatchNormOp<DeviceType::GPU, float> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context) {
......@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
} // namespace ops
......
......@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
}
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
}
};
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
template<>
class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
......@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
}
} // namespace ops
......
......@@ -34,16 +34,16 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BiasAddOp;
template <>
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
{}
has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BiasAddOp<DeviceType::GPU, T> : public Operation {
template<>
class BiasAddOp<DeviceType::GPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
......@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
......@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -23,10 +23,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class ChannelShuffleOp;
template <typename T>
template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
......@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
template<>
class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1);
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_PAD_H_
#define MACE_OPS_PAD_H_
#ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace {
namespace ops {
......@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops
} // namespace mace
#endif // MACE_OPS_PAD_H_
#endif // MACE_OPS_COMMON_PAD_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_POOLING_H_
#define MACE_OPS_POOLING_H_
#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace {
......@@ -23,4 +23,4 @@ enum PoolingType {
};
} // namespace mace
#endif // MACE_OPS_POOLING_H_
#endif // MACE_OPS_COMMON_POOLING_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REDUCE_H_
#define MACE_OPS_REDUCE_H_
#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace {
......@@ -28,4 +28,4 @@ enum ReduceType {
};
} // namespace mace
#endif // MACE_OPS_REDUCE_H_
#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
......@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_
#define MACE_OPS_RESIZE_BICUBIC_H_
#ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_bicubic {
namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size,
......@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_
#endif // MACE_OPS_COMMON_UTILS_H_
......@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_;
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class ConcatOp;
template <typename T>
template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
......@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
template<>
class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
kernel_ = make_unique<opencl::image::ConcatKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
......@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
......@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
......@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) {
}
// Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
template<>
class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
......@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
}
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
......@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
}
} // namespace ops
......
......@@ -24,10 +24,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class CropOp;
template <class T>
template<class T>
class CropOp<DeviceType::CPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
......@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
......@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>();
const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data());
......@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
private:
void crop_copy(const T* input_data, T* output_data,
void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
const int32_t* offsets) {
const int32_t *offsets) {
const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3];
......@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr =
T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch =
const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
......@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class CropOp<DeviceType::GPU, T> : public Operation {
template<>
class CropOp<DeviceType::GPU, float> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
MACE_NOT_IMPLEMENTED;
......@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
......@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
3,
......@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class DepthToSpaceOp : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
......@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
template<>
class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
}
} // namespace ops
......
......@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
}
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
......@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
......
......@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
......@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
} // namespace ops
......
......@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class EltwiseOp<DeviceType::GPU, T> : public Operation {
template<>
class EltwiseOp<DeviceType::GPU, float> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context) {
......@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
}
} // namespace ops
......
......@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
template<>
class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
}
} // namespace ops
......
......@@ -18,7 +18,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation {
public:
explicit IdentityOp(OpConstructContext *context)
......@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
};
void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -19,7 +19,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation {
public:
explicit InferConv2dShapeOp(OpConstructContext *context)
......@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation {
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else {
switch (padding_type) {
case SAME:
case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1];
break;
case VALID:
}
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1];
break;
default:
}
default: {
MACE_NOT_IMPLEMENTED;
break;
}
}
}
if (isNCHW) {
output_data[0] = out_batch;
......@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
};
void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
template<>
class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
......@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
};
#endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
......@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t size = input->dim(0);
......@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS;
}
// Dispatches a GPU-buffer transform according to the requested buffer type.
// Filter/argument types get dedicated re-layout kernels; anything else is a
// plain data-type conversion, or a zero-copy alias when dtypes already match.
MaceStatus BufferTransform::Compute(OpContext *context,
                                    const Tensor *input,
                                    const OpenCLBufferType type,
                                    const int wino_blk_size,
                                    Tensor *output) {
  MACE_UNUSED(wino_blk_size);  // Winograd block size is irrelevant here.
  if (type == CONV2D_FILTER) {
    return TransformConv2DFilter(context, &kernel_, input, output);
  }
  if (type == DW_CONV2D_FILTER) {
    return TransformDWConv2DFilter(context, &kernel_, input, output);
  }
  if (type == ARGUMENT) {
    return TransformArgument(context, &kernel_, input, output);
  }
  // Remaining buffer types: convert only when the dtypes differ; otherwise
  // share the input's underlying buffer instead of copying it.
  if (input->dtype() != output->dtype()) {
    return BufferTypeTransform(context, &kernel_, input, output);
  }
  SetFutureDefaultWaitFn(context->future());
  output->ReuseTensorBuffer(*input);
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
class BufferTransform : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpContext *context,
......@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated dispatcher: the transform's target data type is derived
// from the template parameter T rather than from the output tensor's dtype.
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
                                       const Tensor *input,
                                       const OpenCLBufferType type,
                                       const int wino_blk_size,
                                       Tensor *output) {
  // Winograd block size does not affect buffer transforms.
  MACE_UNUSED(wino_blk_size);
  const DataType dt = DataTypeToEnum<T>::value;
  switch (type) {
    case CONV2D_FILTER:
      return TransformConv2DFilter(context, &kernel_, input, dt, output);
    case DW_CONV2D_FILTER:
      return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
    case ARGUMENT:
      return TransformArgument(context, &kernel_, input, dt, output);
    default:
      // Generic path: convert only when the source dtype differs from T;
      // otherwise alias the input buffer (zero copy) and return success.
      if (input->dtype() != dt) {
        return BufferTypeTransform(context, &kernel_, input, dt, output);
      } else {
        SetFutureDefaultWaitFn(context->future());
        output->ReuseTensorBuffer(*input);
        return MaceStatus::MACE_SUCCESS;
      }
  }
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
......@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Reports whether this layer has a Winograd-eligible shape: a 3x3 filter with
// unit strides and no dilation. The GPU-buffer implementation ignores the
// runtime, output shape, and block-size output parameters.
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<index_t> &filter_shape,
    const std::vector<index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_block_size) {
  // Silence unused warnings; the buffer path does not consult these (nor the
  // kwg_size_ member).
  MACE_UNUSED(kwg_size_);
  MACE_UNUSED(runtime);
  MACE_UNUSED(output_shape);
  MACE_UNUSED(wino_block_size);
  const bool filter_3x3 = (filter_shape[2] == 3) && (filter_shape[3] == 3);
  const bool unit_stride = (strides[0] == 1) && (strides[1] == 1);
  const bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
  return filter_3x3 && unit_stride && no_dilation;
}
// Runs a 2-D convolution on GPU buffers. Computes the output shape and
// paddings, optionally pads the input into a scratch-backed tensor, then
// dispatches to the 1x1 or general conv kernel. The pad and conv kernels'
// completion futures are merged into context->future().
// Returns MACE_SUCCESS, or the first failing status from resize/pad/conv.
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int winograd_blk_size,
    Tensor *output) {
  // Winograd is not implemented for the buffer path.
  MACE_UNUSED(winograd_blk_size);
  StatsFuture pad_future, conv_future;
  index_t filter_h = filter->dim(2);
  index_t filter_w = filter->dim(3);
  // Reshape output: derive output shape and paddings either from the padding
  // policy or from explicitly supplied padding values.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  // paddings hold the total pad per axis; split roughly evenly (top/left get
  // the smaller half on odd totals).
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;
  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);
  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
  // Mark whether input changed or not; kernels re-set their arguments only
  // when the input shape (or scratch buffer, below) changed.
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  bool use_1x1 = filter_h == 1 && filter_w == 1;
  std::vector<index_t> padded_output_shape = output_shape;
  // Tile sizes presumably match the conv kernels' vectorization widths —
  // TODO(review): confirm against the OpenCL kernel sources.
  index_t tile_w, tile_c = 4;
  if (use_1x1) {
    tile_w = 2;
  } else {
    tile_w = 4;
  }
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
  const Tensor *padded_input_ptr = input;
  // pad input: only materialize a padded copy when padding actually changes
  // the input's dimensions.
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    // decide scratch size before allocate it
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A reallocated (grown) scratch buffer invalidates previously-set kernel
    // arguments, so force them to be refreshed.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  // Select the kernel: dedicated fast path for 1x1 filters, general kernel
  // otherwise. Both record completion into conv_future.
  if (use_1x1) {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2d1x1(
          context, &kernels_[1], pad_input, filter, bias, strides,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  } else {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2dGeneral(
          context, &kernels_[1], pad_input, filter, bias, strides, dilations,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  }
  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
  MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
......@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated variant: reports whether the layer has a Winograd-eligible
// shape (3x3 filter, unit strides, no dilation).
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<index_t> &filter_shape,
    const std::vector<index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_block_size) {
  // The buffer implementation never enables Winograd, so these inputs and the
  // block-size output are ignored.
  MACE_UNUSED(runtime);
  MACE_UNUSED(output_shape);
  MACE_UNUSED(wino_block_size);
  return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
          strides[0] == 1 && strides[1] == 1 &&
          dilations[0] == 1 && dilations[1] == 1);
}
// Legacy templated variant of the buffer conv2d entry point. Identical flow
// to the non-template version, except the kernels' output data type is taken
// from the template parameter T (DataTypeToEnum<T>::v()) instead of from the
// output tensor. Computes output shape/paddings, optionally pads the input
// via the scratch buffer, then runs the 1x1 or general conv kernel, merging
// both kernels' futures into context->future().
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int winograd_blk_size,
    Tensor *output) {
  // Winograd is not implemented for the buffer path.
  MACE_UNUSED(winograd_blk_size);
  StatsFuture pad_future, conv_future;
  index_t filter_h = filter->dim(2);
  index_t filter_w = filter->dim(3);
  // Reshape output
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  // paddings hold the total pad per axis; halve for the top/left side.
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;
  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);
  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  bool use_1x1 = filter_h == 1 && filter_w == 1;
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w, tile_c = 4;
  if (use_1x1) {
    tile_w = 2;
  } else {
    tile_w = 4;
  }
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
  const Tensor *padded_input_ptr = input;
  // pad input: only materialize a padded copy when padding changes the
  // input's dimensions.
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    // decide scratch size before allocate it
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A grown scratch buffer invalidates previously-bound kernel arguments.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  if (use_1x1) {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2d1x1(
          context, &kernels_[1], pad_input, filter, bias, strides,
          DataTypeToEnum<T>::v(), activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  } else {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2dGeneral(
          context, &kernels_[1], pad_input, filter, bias, strides, dilations,
          DataTypeToEnum<T>::v(), activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  }
  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
  MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
} // namespace depthwise
// Runs depthwise conv2d on GPU buffers in two stages: kernels_[0] optionally
// pads/tiles the input into a scratch buffer, kernels_[1] performs the
// depthwise convolution; both events are merged into the caller's future.
// Returns MACE_SUCCESS, or the first error from resize / kernel launch.
MaceStatus DepthwiseConv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  StatsFuture pad_future, dw_conv_future;
  index_t filter_w = filter->dim(3);
  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
  fake_filter_shape[1] = filter->dim(1);
  fake_filter_shape[2] = filter->dim(2);
  fake_filter_shape[3] = filter->dim(3);
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // No explicit padding supplied: derive paddings and output shape from
    // the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    // Explicit padding supplied: only the output shape needs computing.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;
  MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
  MACE_CHECK(filter->dim(0) * input_channels == channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  // Round output width and input channels up to the tile sizes (4) so the
  // kernel only ever touches whole tiles; the padded input width is the
  // receptive field needed to produce the padded output width.
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w = 4, tile_c = 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
        * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // A reallocated scratch buffer invalidates the cached kernel args, so
    // force them to be set again.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  MACE_RETURN_IF_ERROR(
      depthwise::DepthwiseConv2d(
          context, &kernels_[1], padded_input_ptr, filter, bias, strides,
          dilations, activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &dw_conv_future));
  MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
......@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
// Legacy templated variant (removed by this change): identical to the
// non-templated Compute above except that T selects the OpenCL compute data
// type passed to the kernel via DataTypeToEnum<T>::v().
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  StatsFuture pad_future, dw_conv_future;
  index_t filter_w = filter->dim(3);
  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
  fake_filter_shape[1] = filter->dim(1);
  fake_filter_shape[2] = filter->dim(2);
  fake_filter_shape[3] = filter->dim(3);
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // Derive paddings and output shape from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // calculate padded input shape
  index_t width = output_shape[2];
  index_t channels = output_shape[3];
  index_t input_height = input->dim(1);
  index_t input_width = input->dim(2);
  index_t input_channels = input->dim(3);
  int pad_top = paddings[0] >> 1;
  int pad_left = paddings[1] >> 1;
  MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
  MACE_CHECK(filter->dim(0) * input_channels == channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  // Tile output width / input channels by 4 (tile_w / tile_c).
  std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w = 4, tile_c = 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
        * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // Scratch reallocation invalidates cached kernel args.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  MACE_RETURN_IF_ERROR(
      depthwise::DepthwiseConv2d(
          context, &kernels_[1], padded_input_ptr, filter, bias, strides,
          dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &dw_conv_future));
  MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs 2-D pooling (MAX or AVG) on GPU buffers. kernels_[0] channel-pads the
// input to a multiple of 4 when needed; kernels_[1] is the pooling kernel,
// built lazily on first use. Pad and pooling events are merged into the
// caller's future. Dilation is not supported.
MaceStatus PoolingKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "Pooling opencl kernel not support dilation yet";
  StatsFuture pad_future, pooling_future;
  index_t input_channels = input->dim(3);
  std::vector<index_t> output_shape(4);
  // Fake conv-style filter shape (out_ch, in_ch, kh, kw) so the shared
  // conv2d shape helpers can compute pooling paddings/output size.
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // Derive paddings and output shape from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  // pad input
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
        * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // Scratch reallocation invalidates cached kernel args, so force rebind.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, 0, 0,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION
  if (kernel->get() == nullptr) {
    // First run: build the pooling kernel with data-type macros baked in.
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    auto input_dtype = input->dtype();
    auto input_dt = DtToCLDt(input_dtype);
    built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
    auto output_dtype = output->dtype();
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
    // MAX pooling never mixes values, so it can compute in the input's own
    // type; otherwise accumulate in float.
    if (pooling_type == MAX && input_dtype == output_dtype) {
      built_options.emplace("-DDATA_TYPE=" + input_dt);
    } else {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
                                              kernel_name,
                                              built_options,
                                              kernel));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  // Global work size: (channel blocks of 4, output width, batch * height).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(*kernel);
  if (input_changed) {
    // Rebind all kernel arguments only when the input shape (or scratch
    // buffer) changed since the previous run.
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_3D_GWS_ARGS(*kernel, gws);
    kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
    kernel->setArg(idx++, paddings[0] / 2);
    kernel->setArg(idx++, paddings[1] / 2);
    kernel->setArg(idx++, strides[0]);
    kernel->setArg(idx++, strides[1]);
    kernel->setArg(idx++, kernels[0]);
    kernel->setArg(idx++, kernels[1]);
    kernel->setArg(idx++, *(output->opencl_buffer()));
  }
  const std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION
  MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
......@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated variant (removed by this change): same flow as the
// non-templated PoolingKernel::Compute, but the compute/output data type is
// taken from the template parameter T instead of the tensors' dtypes.
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "Pooling opencl kernel not support dilation yet";
  StatsFuture pad_future, pooling_future;
  index_t input_channels = input->dim(3);
  std::vector<index_t> output_shape(4);
  // Fake conv-style filter shape so conv2d shape helpers can be reused.
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  // Mark whether input changed or not
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  // pad input
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
  const Tensor *padded_input_ptr = input;
  // pad input
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[3] != input_channels) {
    index_t total_scratch_size = 0;
    index_t padded_input_size = 0;
    padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
        * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;
    // Init scratch buffer
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    // Scratch reallocation invalidates cached kernel args.
    if (old_scratch_size_ != scratch->size()) {
      input_changed |= scratch->size() != old_scratch_size_;
      old_scratch_size_ = scratch->size();
    }
    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, 0, 0,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }
  cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION
  if (kernel->get() == nullptr) {
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    // MAX pooling with matching dtypes may compute in T directly; all other
    // cases upcast the compute type (half -> float).
    if (pooling_type == MAX && input->dtype() == output->dtype()) {
      built_options.emplace("-DIN_DATA_TYPE=" +
                            DtToCLDt(input->dtype()));
      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    } else {
      built_options.emplace("-DIN_DATA_TYPE=" +
                            DtToCLDt(input->dtype()));
      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
                                              kernel_name,
                                              built_options,
                                              kernel));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  // Global work size: (channel blocks of 4, output width, batch * height).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(*kernel);
  if (input_changed) {
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_3D_GWS_ARGS(*kernel, gws);
    kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
    kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
    kernel->setArg(idx++, paddings[0] / 2);
    kernel->setArg(idx++, paddings[1] / 2);
    kernel->setArg(idx++, strides[0]);
    kernel->setArg(idx++, strides[1]);
    kernel->setArg(idx++, kernels[0]);
    kernel->setArg(idx++, kernels[1]);
    kernel->setArg(idx++, *(output->opencl_buffer()));
  }
  const std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                           gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION
  MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs softmax (optionally log-softmax when use_log_ is set) over the channel
// dimension of a GPU buffer. The kernel is built lazily on first use and its
// arguments are rebound only when the logits shape changes.
MaceStatus SoftmaxKernel::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  // View the logits as NHWC; a 2-D tensor is treated as N x C with a 1x1
  // spatial extent. Other ranks are rejected.
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  switch (logits->dim_size()) {
    case 2:
      batch = logits->dim(0);
      height = 1;
      width = 1;
      channels = logits->dim(1);
      break;
    case 4:
      batch = logits->dim(0);
      height = logits->dim(1);
      width = logits->dim(2);
      channels = logits->dim(3);
      break;
    default:
      MACE_NOT_IMPLEMENTED;
  }
  // Channels are processed in blocks of 4; remain_channels is the tail the
  // kernel must mask off in the last block.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION
  if (kernel_.get() == nullptr) {
    // First run: compile the kernel with the data-type macros baked in.
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    if (use_log_) {
      built_options.emplace("-DUSE_LOG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, logits->shape())) {
    // Shape changed since the previous run: rebind every kernel argument.
    uint32_t arg_idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(arg_idx++, *(logits->opencl_buffer()));
    kernel_.setArg(arg_idx++, static_cast<int>(height));
    kernel_.setArg(arg_idx++, static_cast<int>(channels));
    kernel_.setArg(arg_idx++, remain_channels);
    kernel_.setArg(arg_idx++, *(output->opencl_buffer()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -29,7 +29,7 @@ namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
explicit SoftmaxKernel(bool use_log)
......@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated variant (removed by this change): identical flow to the
// non-templated SoftmaxKernel::Compute, but the output/compute OpenCL types
// come from the template parameter T (half upcast to float for compute).
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  // Interpret logits as NHWC; 2-D input is N x C with 1x1 spatial extent.
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  // Channels are processed in blocks of 4; remain_channels is the tail that
  // the kernel masks off in the last block.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION
  if (kernel_.get() == nullptr) {
    // First run: build the kernel with data-type macros baked in.
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    if (use_log_) built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, logits->shape())) {
    // Rebind kernel arguments only when the logits shape changed.
    uint32_t idx = 0;
    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_buffer()));
    kernel_.setArg(idx++, static_cast<int>(height));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = {4, 4, 4, 0};
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -20,11 +20,11 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BufferTransformOp;
template <typename T>
class BufferTransformOp<DeviceType::GPU, T> : public Operation {
template<>
class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
......@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
......@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_;
};
void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
}
} // namespace ops
......
......@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix;
}
// Replaces op_def's input at input_idx with a GPU-layout copy of that filter
// tensor: creates the destination tensor (same dtype as the source), rewires
// the op to read it, marks the source unused, and runs the buffer transform.
// Returns the status of the transform.
MaceStatus TransformFilter(
    mace::OpConstructContext *context,
    OperatorDef *op_def,
    const int input_idx,
    const OpenCLBufferType buffer_type,
    const MemoryType mem_type,
    const int wino_blk_size) {
  Workspace *workspace = context->workspace();
  OpContext op_context(workspace, context->device());
  const std::string src_name = op_def->input(input_idx);
  Tensor *src = workspace->GetTensor(src_name);
  const std::string dst_name = TransformedFilterName(src_name);
  // Destination keeps the source dtype and is marked as a weight tensor.
  Tensor *dst = workspace->CreateTensor(
      dst_name, context->device()->allocator(), src->dtype(), true);
  // Point the op at the transformed tensor and retire the original filter.
  op_def->set_input(input_idx, dst_name);
  src->MarkUnused();
  OpenCLBufferTransformer transformer(src->memory_type(), mem_type);
  return transformer.Transform(&op_context, src, buffer_type, mem_type,
                               wino_blk_size, dst);
}
} // namespace ops
} // namespace mace
......@@ -28,17 +28,16 @@
namespace mace {
namespace ops {
// Only used for GPU Operation(BufferTransform)
template<typename T>
class OpenCLBufferTransformer {
public:
OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>();
kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>();
kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>();
kernel_ = make_unique<opencl::buffer::BufferTransform>();
}
}
......@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) {
......@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
......@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size = 0) {
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
const int wino_blk_size = 0);
} // namespace ops
} // namespace mace
......
......@@ -17,8 +17,9 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
class OpContext;
......
......@@ -17,7 +17,10 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace {
......
......@@ -19,6 +19,9 @@
#include <vector>
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace {
......
......@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h"
......
......@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
}
}
// Maps a CPU data type to its upward-compatible OpenCL data type name.
// Both float and half map to "float": half inputs are promoted so that
// kernels always compute in single precision.
std::string DtToUpCompatibleCLDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "float";
  }
  LOG(FATAL) << "Unsupported data type";
  return "";
}
// Maps a CPU data type to the upward-compatible OpenCL command data type
// suffix ("f" selects the float read/write image variants). Half is
// promoted to float, matching DtToUpCompatibleCLDt.
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "f";
  }
  LOG(FATAL) << "Not supported data type for opencl cmd data type";
  return "";
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
......
......@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select
// e.g. half -> float
std::string DtToCLCondDt(const DataType dt);
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Applies the configured activation (RELU/RELUX/PRELU/TANH/SIGMOID/LEAKYRELU)
// element-wise to an image-backed GPU tensor.
// The OpenCL kernel is built lazily on the first call and cached in kernel_;
// kernel arguments are re-set only when the input shape changes.
MaceStatus ActivationKernel::Compute(
OpContext *context,
const Tensor *input,
// alpha: per-channel slope tensor; required (non-null) only for PRELU.
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
// Channels are packed four per image pixel, so work is issued per 4-channel block.
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// First invocation: assemble compile-time options and build the kernel.
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
// GPU computation is always done in float.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
// Select the activation flavor at compile time; tuning_key_prefix_ also
// keys the auto-tuning database for this variant.
switch (activation_) {
case RELU: {
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
}
case RELUX: {
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
}
case PRELU: {
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
}
case TANH: {
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
}
case SIGMOID: {
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
}
case LEAKYRELU: {
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation_;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
// Global work size: one work-item per (channel block, width, batch*height).
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
// Shape changed (or first run): rebind all kernel arguments.
// NOTE: the argument order must match the OpenCL kernel signature.
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -31,12 +31,11 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit,
T leakyrelu_coefficient)
float relux_max_limit,
float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
......@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private:
ActivationType activation_;
T relux_max_limit_;
T leakyrelu_coefficient_;
float relux_max_limit_;
float leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
// Legacy templated variant of ActivationKernel::Compute (removed by this
// merge in favor of the non-templated float-only version above).
// Behaves like the new version except the data type comes from T, promoted
// to float via the DtToUpCompatible* helpers, so half also computes in float.
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpContext *context,
const Tensor *input,
// alpha: per-channel slope tensor; required only for PRELU.
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
// Channels are packed four per image pixel.
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy build of the OpenCL kernel on first use.
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
// Half is promoted to float for GPU computation.
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
// One work-item per (channel block, width, batch*height).
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
// Rebind kernel arguments when the input shape changes.
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
// Scalars are passed as float regardless of T.
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Element-wise sum of 2..4 equally-shaped image-backed GPU tensors.
// Requires at least two inputs; all inputs must share the same NHWC shape.
// The kernel is compiled with INPUT_NUM baked in, so it is rebuilt only
// implicitly via the cached kernel_ (first call), and args are re-set when
// the first input's shape changes.
MaceStatus AddNKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
// All inputs must have identical shapes.
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
// The OpenCL kernel only supports up to 4 inputs.
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
// GPU computation is always done in float.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
// Input count is fixed at kernel compile time.
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
// 2D dispatch: x covers channel-block * width pixels, y covers batch * height.
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
// Shape changed: resize the output image and rebind all arguments.
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
// Fixed local work size; trailing 0 marks the unused dimension.
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
......@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated variant of AddNKernel::Compute (removed by this merge).
// Same behavior as the non-templated version, except the OpenCL data type
// is derived from T via the DtToUpCompatible* helpers (half -> float).
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
// All inputs must share the same NHWC shape.
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
// At most 4 inputs are supported by the OpenCL kernel.
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
// Half is promoted to float for GPU computation.
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
// 2D dispatch over (channel_blocks * width, batch * height).
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
// Resize output image and rebind arguments on shape change.
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Stores the batch-norm configuration; epsilon guards against division by
// zero in variance normalization, and the activation (with its limits) is
// fused into the same OpenCL kernel.
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Runs batch normalization (optionally with a fused activation) on an
// image-backed GPU tensor.
// When mean and var are both provided ("not folded"), the kernel normalizes
// with (mean, var, epsilon); otherwise scale/offset are assumed to already
// fold the statistics, and the FOLDED_CONSTANT variant is compiled.
MaceStatus BatchNormKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
// Channels are packed four per image pixel.
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy build of the OpenCL kernel on first use.
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
// GPU computation is always done in float.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
// Fuse the requested activation into the kernel at compile time.
switch (activation_) {
case NOOP:break;
case RELU:built_options.emplace("-DUSE_RELU");
break;
case RELUX:built_options.emplace("-DUSE_RELUX");
break;
case TANH:built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU");
break;
default:LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
// Rebind all kernel arguments on shape change; order must match the
// OpenCL kernel signature (mean/var/epsilon only in the unfolded variant).
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
......@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated constructor (removed by this merge); identical to the
// non-templated version — only stores the configuration.
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Legacy templated variant of BatchNormKernel::Compute (removed by this
// merge). Same behavior as the non-templated version except the OpenCL data
// type is derived from T via the DtToUpCompatible* helpers (half -> float).
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
// mean/var present => normalize in-kernel; absent => stats already folded
// into scale/offset.
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy kernel build with the fused activation compiled in.
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
// Half is promoted to float for GPU computation.
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
// Rebind arguments on shape change; order matches the kernel signature.
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Rearranges a batch-major tensor back into spatial form (inverse of
// space-to-batch), writing into space_tensor which is resized here.
// block_shape gives the spatial block sizes; paddings[0]/paddings[2] are the
// top/left crops applied during the inverse transform.
MaceStatus BatchToSpaceKernel::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
// Allocate/resize the output image to the target (spatial) shape.
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
// Work is issued over the *batch* tensor: one item per
// (channel block, width, batch*height).
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy build of the OpenCL kernel on first use.
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name)
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
// Data type follows the input tensor's dtype (no forced float here).
auto dt = batch_tensor->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
// Rebind arguments on shape change; order matches the kernel signature.
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
......@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated variant of BatchToSpaceKernel::Compute (removed by this
// merge). Same behavior, except the OpenCL data type comes from the template
// parameter T rather than from the input tensor's runtime dtype.
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
// Allocate/resize the output image to the target (spatial) shape.
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
// Dispatch over the batch tensor: (channel block, width, batch*height).
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy kernel build on first use.
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
// Rebind arguments on shape change; order matches the kernel signature.
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Adds a per-channel bias to an image-backed GPU tensor.
// Unlike the other kernels here, this one enqueues the NDRange directly
// (no auto-tuning): it picks the default local work size and, when
// non-uniform workgroups are unsupported, rounds the global size up to a
// multiple of the local size.
MaceStatus BiasAddKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
// Channels are packed four per image pixel.
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
// Lazy build of the OpenCL kernel on first use.
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
// GPU computation is always done in float.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
// Rebind arguments on shape change; order matches the kernel signature.
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
// Device accepts gws not divisible by lws: enqueue as-is.
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
// Round each global dimension up to a multiple of the local size;
// the kernel is expected to guard out-of-range work-items.
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
// Expose an async wait hook so callers can block on (and profile) this op.
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
......@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated implementation (removed side of the diff): identical to
// the non-templated version except the OpenCL data type is derived from the
// template parameter T via DataTypeToEnum<T> instead of being fixed to float.
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *bias,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are processed in blocks of 4, matching the image packing.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Global work size: [channel blocks, width, height * batch].
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel once, keyed on T's data type.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Round the global work size up to a multiple of the local work size
    // for devices without non-uniform work-group support.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Hook up async completion/profiling if the caller passed a future.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Copies a tensor from OpenCL buffer layout into OpenCL image layout.
//
// The target image shape is computed from the buffer shape and |type|
// (filter, in/out, argument, winograd-transformed filter, ...), a matching
// "*_buffer_to_image" kernel is selected and built, per-type arguments are
// bound, and the kernel is enqueued over a 2-D global work size.
MaceStatus BufferToImage::Compute(
    OpContext *context,
    const Tensor *input,
    const OpenCLBufferType type,
    const int wino_blk_size,
    Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
  // Each work-item produces one image pixel: gws = [image width, height].
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Pick the transform kernel that matches the buffer type.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_buffer_to_image";
      break;
    case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image";
      break;
    case ARGUMENT:kernel_name = "arg_buffer_to_image";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image";
      break;
    case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image";
      break;
    case WINOGRAD_FILTER: {
      std::stringstream ss_tmp;
      // The winograd kernel writes all (blk+2)^2 transformed tiles per
      // work-item, so shrink the Y dimension accordingly.
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_buffer_to_image_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel once; the compiled program is cached in kernel_.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (input->dtype() == output->dtype()) {
      // Same dtype on both sides: build the kernel in that native type.
      auto input_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    } else {
      // Mismatched dtypes: build in float (the widest supported type here),
      // presumably letting reads/writes convert — verify against the .cl
      // kernels if this path changes.
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
        "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_buffer()));
    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
               "buffer offset not aligned");
    // Offset is passed in elements, not bytes.
    kernel_.setArg(idx++,
                   static_cast<uint32_t>(input->buffer_offset() /
                       GetEnumTypeSize(input->dtype())));
    // The remaining arguments describe the buffer geometry and differ per
    // buffer type.
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = input->dim(1) * input->dim(2) * input->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Fixed local work size: 16 in X, remainder of the max work-group in Y.
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round up the global work size for devices without non-uniform
    // work-group support.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Hook up async completion/profiling if the caller passed a future.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
......@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated implementation (removed side of the diff): the OpenCL
// data type comes from the template parameter T rather than from the input
// tensor's runtime dtype.
template <typename T>
MaceStatus BufferToImage<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const OpenCLBufferType type,
    const int wino_blk_size,
    Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
  // One work-item per output image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Select the transform kernel that matches the buffer type.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:
      kernel_name = "filter_buffer_to_image";
      break;
    case DW_CONV2D_FILTER:
      kernel_name = "dw_filter_buffer_to_image";
      break;
    case IN_OUT_CHANNEL:
      kernel_name = "in_out_buffer_to_image";
      break;
    case ARGUMENT:
      kernel_name = "arg_buffer_to_image";
      break;
    case IN_OUT_HEIGHT:
      kernel_name = "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:
      kernel_name = "in_out_width_buffer_to_image";
      break;
    case WEIGHT_HEIGHT:
      kernel_name = "weight_height_buffer_to_image";
      break;
    case WEIGHT_WIDTH:
      kernel_name = "weight_width_buffer_to_image";
      break;
    case WINOGRAD_FILTER: {
      std::stringstream ss_tmp;
      // Each work-item covers all (blk+2)^2 transformed tiles.
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_buffer_to_image_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (input->dtype() == output->dtype()) {
      // Matching dtypes: build the kernel in T's type.
      built_options.emplace(
          "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
          DtToCLCMDDt(DataTypeToEnum<T>::value));
    } else {
      // Mismatched dtypes: build with the up-compatible (wider) type.
      built_options.emplace("-DDATA_TYPE=" +
          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
        "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_buffer()));
    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
               "buffer offset not aligned");
    // Offset is passed in elements, not bytes.
    kernel_.setArg(idx++,
                   static_cast<uint32_t>(input->buffer_offset() /
                       GetEnumTypeSize(input->dtype())));
    // Geometry arguments differ per buffer type.
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = input->dim(1) * input->dim(2) * input->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Fixed local work size: 16 in X, remainder of the max work-group in Y.
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round up the global work size to a multiple of the local size.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Hook up async completion/profiling if the caller passed a future.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the channel-shuffle OpenCL image kernel: the channel dimension is
// split into |groups_| groups and the channels are re-ordered by
// interleaving across those groups.
//
// The compiled kernel is cached in kernel_; kernel arguments are re-bound
// only when the input shape changes, and the launch is auto-tuned via
// TuningOrRun3DKernel.
MaceStatus ChannelShuffleKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK(input->dim(3) % groups_ == 0,
             "input channels must be an integral multiple of group. ",
             input->dim(3));
  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();

  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channels_per_group = channels / groups_;
  // Each work-item handles a block of 4 channels within one group.
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
  // Global work size: [channel blocks per group, width, height * batch].
  const uint32_t gws[3] = {
      static_cast<uint32_t>(group_channel_blocks),
      static_cast<uint32_t>(width),
      static_cast<uint32_t>(height * batch)};

  MACE_OUT_OF_RANGE_DEFINITION;
  // Compile the kernel once and remember its max work-group size.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const std::string obfuscated_name =
        MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + obfuscated_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("channel_shuffle",
                                              obfuscated_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }

  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Arguments depend only on the tensors and gws; skip re-binding when the
  // input shape is unchanged.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }

  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  const std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
......@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated implementation (removed side of the diff): identical to
// the non-templated version except the OpenCL data type is taken from the
// template parameter T.
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK(input->dim(3) % groups_ == 0,
             "input channels must be an integral multiple of group. ",
             input->dim(3));
  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channels_per_group = channels / groups_;
  // Each work-item handles a block of 4 channels within one group.
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
  // Global work size: [channel blocks per group, width, height * batch].
  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Compile the kernel once, keyed on T's data type.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("channel_shuffle", kernel_name,
                             built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // The tuning key identifies this launch configuration for auto-tuning.
  std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
auto data_dt = input0->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
......@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
......@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
......@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
}
} // namespace concat
// Concatenates the input tensors along |axis| into |output| on GPU.
//
// Validates that all inputs have the same rank and agree on every
// dimension except |axis|, resizes the output image accordingly, then
// dispatches to the specialized two-input kernel (Concat2) or the general
// N-input kernel (ConcatN).
MaceStatus ConcatKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    const int32_t axis,
    Tensor *output) {
  const int inputs_count = input_list.size();
  const Tensor *input0 = input_list[0];
  // Start from input0's shape and accumulate the concat axis.
  std::vector<index_t> output_shape(input0->shape());
  for (int i = 1; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
    MACE_CHECK(input->dim_size() == input0->dim_size(),
               "Ranks of all input tensors must be same.");
    for (int j = 0; j < input->dim_size(); ++j) {
      if (j == axis) {
        continue;
      }
      MACE_CHECK(input->dim(j) == input0->dim(j),
                 "Dimensions of inputs should equal except axis.");
    }
    output_shape[axis] += input->dim(axis);
  }
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Two inputs have a dedicated, faster kernel; otherwise fall back to the
  // generic multi-input kernel.
  switch (inputs_count) {
    case 2:
      return concat::Concat2(
          context, &kernel_, input_list[0], input_list[1],
          &input_shape_, output, &kwg_size_);
    default:
      return concat::ConcatN(context,
                             &kernel_,
                             input_list,
                             output,
                             &kwg_size_);
  }
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
ConcatKernel() {}
......@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_;
};
// Legacy templated implementation (removed side of the diff): forwards T's
// data type to the concat helpers instead of letting them derive it from
// the tensors.
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    const int32_t axis,
    Tensor *output) {
  const int inputs_count = input_list.size();
  const Tensor *input0 = input_list[0];
  // Start from input0's shape and accumulate the concat axis.
  std::vector<index_t> output_shape(input0->shape());
  for (int i = 1; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
    MACE_CHECK(input->dim_size() == input0->dim_size(),
               "Ranks of all input tensors must be same.");
    for (int j = 0; j < input->dim_size(); ++j) {
      if (j == axis) {
        continue;
      }
      MACE_CHECK(input->dim(j) == input0->dim(j),
                 "Dimensions of inputs should equal except axis.");
    }
    output_shape[axis] += input->dim(axis);
  }
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Two inputs use the dedicated kernel; otherwise the generic one.
  switch (inputs_count) {
    case 2:
      return concat::Concat2(
          context, &kernel_, input_list[0], input_list[1],
          DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
    default:
      return concat::ConcatN(context, &kernel_, input_list,
                             DataTypeToEnum<T>::value, output, &kwg_size_);
  }
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Decides whether the Winograd path can be used for this convolution.
//
// Winograd is only applicable to 3x3, stride-1, non-dilated filters, and
// the transformed tensors must fit within the device's 2-D image limits.
// If the requested 4x4 block would exceed those limits, *wino_blk_size is
// downgraded to 2 and the 2x2 block is checked instead.
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<mace::index_t> &filter_shape,
    const std::vector<mace::index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_blk_size) {
  // Guard: only 3x3 / stride 1 / dilation 1 qualifies.
  const bool eligible =
      filter_shape[2] == 3 && filter_shape[3] == 3 &&
      strides[0] <= 1 && strides[1] <= 1 &&
      dilations[0] <= 1 && dilations[1] <= 1;
  if (!eligible) {
    return false;
  }
  const index_t out_channels = filter_shape[0];
  const index_t in_channels = filter_shape[1];
  const auto image_max_size = runtime->GetMaxImage2DSize();
  // True iff the Winograd-transformed tensors for |block_size| fit within
  // the device's maximum 2-D image dimensions.
  auto fits_in_image = [&](int block_size) -> bool {
    const int sqr_block = (block_size + 2) * (block_size + 2);
    const uint64_t transformed_width = static_cast<uint64_t>(
        output_shape[0] *
        ((output_shape[1] + block_size - 1) / block_size) *
        ((output_shape[2] + block_size - 1) / block_size));
    return transformed_width < image_max_size[0] &&
           static_cast<uint64_t>(sqr_block * in_channels) <
               image_max_size[1] &&
           static_cast<uint64_t>(sqr_block * out_channels) <
               image_max_size[1];
  };
  // GPU only supports 4x4 and 2x2 Winograd blocks.
  if (*wino_blk_size == 4) {
    if (fits_in_image(4)) {
      return true;
    }
    // 4x4 exceeds the OpenCL image size limitation; fall back to 2x2.
    *wino_blk_size = 2;
  }
  return fits_in_image(2);
}
// Dispatches a 2-D convolution to the appropriate OpenCL implementation.
//
// Computes paddings and the output shape, resizes the output image, then
// selects one of four paths: Winograd (wino_blk_size != 0), specialized
// 1x1, specialized 3x3, or the general conv kernel. Unsupported
// stride/dilation combinations abort via MACE_NOT_IMPLEMENTED.
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int wino_blk_size,
    Tensor *output) {
  index_t kernel_h = filter->dim(2);
  index_t kernel_w = filter->dim(3);
  // Unequal strides, or dilation combined with stride > 1 or a 1xN filter,
  // have no OpenCL implementation.
  if (strides[0] != strides[1] ||
      (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << ",dilations " << dilations[0] << "x" << dilations[1]
                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }
  // Reshape output: derive paddings/output shape from the padding type, or
  // honor explicit padding_data when provided.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  std::function<MaceStatus()> conv_func;
  if (wino_blk_size != 0) {
    // Use Winograd convolution; it runs three kernels (input transform,
    // GEMM, output transform), hence the arrays of kernels/kwg sizes.
    conv_func = [&]() -> MaceStatus {
      cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
      uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
      return WinogradConv2dK3x3S1(context,
                                  kernels,
                                  input,
                                  filter,
                                  bias,
                                  paddings.data(),
                                  activation,
                                  relux_max_limit,
                                  leakyrelu_coefficient,
                                  wino_blk_size,
                                  &input_shape_,
                                  output,
                                  kwg_size);
    };
  } else if (kernel_h == 1 && kernel_w == 1) {
    // Specialized 1x1 kernel.
    conv_func = [&]() -> MaceStatus {
      return Conv2dK1x1(context,
                        &kernels_[0],
                        input,
                        filter,
                        bias,
                        strides[0],
                        paddings.data(),
                        dilations,
                        activation,
                        relux_max_limit,
                        leakyrelu_coefficient,
                        &input_shape_,
                        output,
                        &kwg_size_[0]);
    };
  } else if (kernel_h == 3 && kernel_w == 3) {
    // Specialized 3x3 kernel (non-Winograd).
    conv_func = [&]() -> MaceStatus {
      return Conv2dK3x3(context,
                        &kernels_[0],
                        input,
                        filter,
                        bias,
                        strides[0],
                        paddings.data(),
                        dilations,
                        activation,
                        relux_max_limit,
                        leakyrelu_coefficient,
                        &input_shape_,
                        output,
                        &kwg_size_[0]);
    };
  } else {
    // General convolution kernel for all other filter sizes.
    conv_func = [&]() -> MaceStatus {
      return Conv2d(context,
                    &kernels_[0],
                    input,
                    filter,
                    bias,
                    strides[0],
                    paddings.data(),
                    dilations,
                    activation,
                    relux_max_limit,
                    leakyrelu_coefficient,
                    &input_shape_,
                    output,
                    &kwg_size_[0]);
    };
  }
  return conv_func();
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
此差异已折叠。
......@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
extern MaceStatus Conv2dK1x1(OpContext *context,
MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
......@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
built_options, kernel));
......
......@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
extern MaceStatus Conv2dK3x3(OpContext *context,
MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
......@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
case NOOP: {
break;
case RELU:
}
case RELU: {
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
}
case RELUX: {
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
}
case TANH: {
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
}
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
}
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
built_options, kernel));
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册