Commit 3ba07e0b authored by liuqi

Change DeviceType::OPENCL to DeviceType::GPU and fix some bugs.

Parent commit d98386d6
...@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) { ...@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) {
Custom_Op<DeviceType::CPU, float>); Custom_Op<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Custom_Op<DeviceType::OPENCL, float>); Custom_Op<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
Custom_Op<DeviceType::OPENCL, half>); Custom_Op<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data ...@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//3. Declare the device type //3. Declare the device type
DeviceType device_type = DeviceType::OPENCL; DeviceType device_type = DeviceType::GPU;
//4. Define the input and output name arrays //4. Define the input and output name arrays
std::vector<std::string> input_names = {...}; std::vector<std::string> input_names = {...};
...@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) { ...@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) {
//6. Create the MaceEngine object //6. Create the MaceEngine object
mace::MaceEngine engine(&net_def, device_type, input_names, output_names); mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. If the device type is OPENCL or HEXAGON, model_data can be released here //7. If the device type is GPU or HEXAGON, model_data can be released here
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
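The hunk above updates the engine-creation walkthrough in the documentation. For convenience, here is a consolidated sketch of that flow (steps 2 through 7) with the renamed enum. It is illustrative only: model_data_file and the input/output node names are hypothetical placeholders, step 5 (preparing the input/output buffers) stays elided as in the hunk, MACE_MODEL_TAG stands for the generated model namespace, and the snippet assumes the mace namespace is visible as in the original documentation example.

//2. Load the model data and create the NetDef (model_data_file is a placeholder path).
unsigned char *model_data =
    mace::MACE_MODEL_TAG::LoadModelData(model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//3. Declare the device type (GPU replaces the old OPENCL value).
DeviceType device_type = DeviceType::GPU;
//4. Define the input and output name arrays (placeholder names).
std::vector<std::string> input_names = {"input_node0"};
std::vector<std::string> output_names = {"output_node0"};
//6. Create the MaceEngine object.
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. If the device type is GPU or HEXAGON, model_data can be released here.
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
  mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}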
...@@ -108,10 +108,8 @@ inline int64_t NowMicros() { ...@@ -108,10 +108,8 @@ inline int64_t NowMicros() {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine, ...@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine,
return true; return true;
} }
DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]"); DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
DEFINE_string(input_node, "input_node0,input_node1", DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma"); "input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1", DEFINE_string(output_node, "output_node0,output_node1",
...@@ -279,7 +277,7 @@ int Main(int argc, char **argv) { ...@@ -279,7 +277,7 @@ int Main(int argc, char **argv) {
mace::SetOpenMPThreadPolicy( mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
...@@ -347,7 +345,7 @@ int Main(int argc, char **argv) { ...@@ -347,7 +345,7 @@ int Main(int argc, char **argv) {
LOG(INFO) << "Run init"; LOG(INFO) << "Run init";
std::unique_ptr<mace::MaceEngine> engine_ptr( std::unique_ptr<mace::MaceEngine> engine_ptr(
new mace::MaceEngine(&net_def, device_type, input_names, output_names)); new mace::MaceEngine(&net_def, device_type, input_names, output_names));
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
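The ParseDeviceType change in the hunks above, repeated in the other command-line tools further down, removes the NEON and OPENCL branches in favour of a single GPU branch. Reassembled, the parser after this commit reads roughly as follows; the closing else branch is cut off in the diff, so the CPU fallback shown here is an assumption rather than the commit's verbatim code.

DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
  } else if (device_str.compare("GPU") == 0) {
    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
    // The diff cuts off here; assume unrecognized strings fall back to CPU.
    return DeviceType::CPU;
  }
}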
...@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) { ...@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -277,7 +275,7 @@ int Main(int argc, char **argv) { ...@@ -277,7 +275,7 @@ int Main(int argc, char **argv) {
FLAGS_gpu_model_data_file.c_str()); FLAGS_gpu_model_data_file.c_str());
NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data); NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names, mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names,
output_names); output_names);
mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data); mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
......
...@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) { ...@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) {
} }
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif #endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
......
...@@ -189,7 +189,7 @@ class Image : public BufferBase { ...@@ -189,7 +189,7 @@ class Image : public BufferBase {
public: public:
Image() Image()
: BufferBase(0), : BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)), allocator_(GetDeviceAllocator(GPU)),
buf_(nullptr), buf_(nullptr),
mapped_buf_(nullptr) {} mapped_buf_(nullptr) {}
...@@ -198,7 +198,7 @@ class Image : public BufferBase { ...@@ -198,7 +198,7 @@ class Image : public BufferBase {
std::accumulate( std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) * shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)), GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)), allocator_(GetDeviceAllocator(GPU)),
mapped_buf_(nullptr) { mapped_buf_(nullptr) {
shape_ = shape; shape_ = shape;
data_type_ = data_type; data_type_ = data_type;
......
...@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run(
input_tensors.push_back(input_tensor); input_tensors.push_back(input_tensor);
} }
for (auto &output : *outputs) { for (auto &output : *outputs) {
if (device_type_ == DeviceType::OPENCL) { if (device_type_ == DeviceType::GPU) {
MACE_CHECK(output.second.shape().size() == 4, MACE_CHECK(output.second.shape().size() == 4,
"The outputs' shape must be 4-dimension with NHWC format," "The outputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions"); " please use 1 to fill missing dimensions");
...@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run(
#endif #endif
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == OPENCL) { if (device_type_ == GPU) {
OpenCLRuntime::Global()->SaveBuiltCLProgram(); OpenCLRuntime::Global()->SaveBuiltCLProgram();
} }
#endif #endif
......
...@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) { ...@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
auto &op = *iter; auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), ")"); op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL && bool future_wait = (device_type_ == DeviceType::GPU &&
(run_metadata != nullptr || (run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1)); std::distance(iter, operators_.end()) == 1));
......
...@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry); ...@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry);
extern void Register_Eltwise(OperatorRegistry *op_registry); extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry); extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_FusedConv2D(OperatorRegistry *op_registry);
extern void Register_LocalResponseNorm(OperatorRegistry *op_registry); extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry); extern void Register_MatMul(OperatorRegistry *op_registry);
extern void Register_Pad(OperatorRegistry *op_registry); extern void Register_Pad(OperatorRegistry *op_registry);
...@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry); ...@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
extern void Register_Proposal(OperatorRegistry *op_registry); extern void Register_Proposal(OperatorRegistry *op_registry);
extern void Register_PSROIAlign(OperatorRegistry *op_registry); extern void Register_PSROIAlign(OperatorRegistry *op_registry);
extern void Register_Quantize(OperatorRegistry *op_registry); extern void Register_Quantize(OperatorRegistry *op_registry);
extern void Register_ReOrganize(OperatorRegistry *op_registry);
extern void Register_Requantize(OperatorRegistry *op_registry); extern void Register_Requantize(OperatorRegistry *op_registry);
extern void Register_Reshape(OperatorRegistry *op_registry); extern void Register_Reshape(OperatorRegistry *op_registry);
extern void Register_ResizeBilinear(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
...@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() { ...@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_Eltwise(this); ops::Register_Eltwise(this);
ops::Register_FoldedBatchNorm(this); ops::Register_FoldedBatchNorm(this);
ops::Register_FullyConnected(this); ops::Register_FullyConnected(this);
ops::Register_FusedConv2D(this);
ops::Register_LocalResponseNorm(this); ops::Register_LocalResponseNorm(this);
ops::Register_MatMul(this); ops::Register_MatMul(this);
ops::Register_Pad(this); ops::Register_Pad(this);
...@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() { ...@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_PSROIAlign(this); ops::Register_PSROIAlign(this);
ops::Register_Quantize(this); ops::Register_Quantize(this);
ops::Register_Requantize(this); ops::Register_Requantize(this);
ops::Register_ReOrganize(this);
ops::Register_Reshape(this); ops::Register_Reshape(this);
ops::Register_ResizeBilinear(this); ops::Register_ResizeBilinear(this);
ops::Register_Slice(this); ops::Register_Slice(this);
......
...@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
if (model_data_size > 0) { if (model_data_size > 0) {
if (type == DeviceType::CPU || type == DeviceType::NEON) { if (type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), new Buffer(GetDeviceAllocator(type),
model_data_ptr, model_data_ptr,
...@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
tensor_map_[const_tensor.name()] = std::move(tensor); tensor_map_[const_tensor.name()] = std::move(tensor);
} }
if (type == DeviceType::CPU || type == DeviceType::OPENCL) { if (type == DeviceType::CPU || type == DeviceType::GPU) {
CreateOutputTensorBuffer(net_def, type); CreateOutputTensorBuffer(net_def, type);
} }
} }
...@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block : net_def.mem_arena().mem_block()) { for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
std::unique_ptr<BufferBase> image_buf( std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype)); new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
...@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype)); (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
tensor->SetSourceOpName(op.name()); tensor->SetSourceOpName(op.name());
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i] << " Mem: " << mem_ids[i]
<< " Image shape: " << " Image shape: "
...@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
} }
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
if (device_type == CPU || device_type == NEON) { if (device_type == CPU) {
return host_scratch_buffer_.get(); return host_scratch_buffer_.get();
} else { } else {
return nullptr; return nullptr;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
* --input_file=input_data \ * --input_file=input_data \
* --output_file=mace.out \ * --output_file=mace.out \
* --model_data_file=model_data.data \ * --model_data_file=model_data.data \
* --device=OPENCL * --device=GPU
*/ */
#include <malloc.h> #include <malloc.h>
#include <stdint.h> #include <stdint.h>
...@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) { ...@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -135,7 +133,7 @@ DEFINE_string(output_file, ...@@ -135,7 +133,7 @@ DEFINE_string(output_file,
DEFINE_string(model_data_file, DEFINE_string(model_data_file,
"", "",
"model data file name, used when EMBED_MODEL_DATA set to 0"); "model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
DEFINE_int32(round, 1, "round"); DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
...@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names,
MaceStatus res = mace::SetOpenMPThreadPolicy( MaceStatus res = mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
...@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names,
// Init model // Init model
mace::MaceEngine engine(&net_def, device_type, input_names, mace::MaceEngine engine(&net_def, device_type, input_names,
output_names); output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
...@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> { ...@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> { class ActivationFunctor<DeviceType::GPU, T> {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {} : activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
......
...@@ -93,7 +93,7 @@ struct AddNFunctor { ...@@ -93,7 +93,7 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> { struct AddNFunctor<DeviceType::GPU, T> {
void operator()(const std::vector<const Tensor *> &input_tensors, void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
...@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { ...@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
......
...@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> { ...@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> { struct BiasAddFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
......
...@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
}; };
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(bool i2b = false) explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {} : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
......
...@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor { ...@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { struct ChannelShuffleFunctor<DeviceType::GPU, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
...@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
......
...@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
......
...@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor { ...@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {} : block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
...@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type, const Padding padding_type,
......
...@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase { ...@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float value) const float value)
......
...@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const int /*BufferType*/ weight_type, FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
......
...@@ -75,7 +75,7 @@ struct MatMulFunctor { ...@@ -75,7 +75,7 @@ struct MatMulFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> { struct MatMulFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *A, void operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
......
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct ActivationFunctor<DeviceType::OPENCL, float>; template struct ActivationFunctor<DeviceType::GPU, float>;
template struct ActivationFunctor<DeviceType::OPENCL, half>; template struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()( void AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors, const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
...@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct AddNFunctor<DeviceType::OPENCL, float>; template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::OPENCL, half>; template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
const Tensor *mean, const Tensor *mean,
...@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct BatchNormFunctor<DeviceType::OPENCL, float>; template struct BatchNormFunctor<DeviceType::GPU, float>;
template struct BatchNormFunctor<DeviceType::OPENCL, half>; template struct BatchNormFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct BiasAddFunctor<DeviceType::OPENCL, float>; template struct BiasAddFunctor<DeviceType::GPU, float>;
template struct BiasAddFunctor<DeviceType::OPENCL, half>; template struct BiasAddFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -20,7 +20,7 @@ namespace mace { ...@@ -20,7 +20,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
...@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct BufferToImageFunctor<DeviceType::OPENCL, float>; template struct BufferToImageFunctor<DeviceType::GPU, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>; template struct BufferToImageFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct ChannelShuffleFunctor<DeviceType::OPENCL, float>; struct ChannelShuffleFunctor<DeviceType::GPU, float>;
template template
struct ChannelShuffleFunctor<DeviceType::OPENCL, half>; struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()( void ConcatFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct ConcatFunctor<DeviceType::OPENCL, float>; template struct ConcatFunctor<DeviceType::GPU, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>; template struct ConcatFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
template <typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
...@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct Conv2dFunctor<DeviceType::OPENCL, float>; template struct Conv2dFunctor<DeviceType::GPU, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>; template struct Conv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) { const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t input_height = input->dim(1); const index_t input_height = input->dim(1);
...@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>; template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, half>; template struct DepthToSpaceOpFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
} }
} }
template struct EltwiseFunctor<DeviceType::OPENCL, float>; template struct EltwiseFunctor<DeviceType::GPU, float>;
template struct EltwiseFunctor<DeviceType::OPENCL, half>; template struct EltwiseFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel,
} // namespace } // namespace
template <typename T> template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>; template struct FullyConnectedFunctor<DeviceType::GPU, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>; template struct FullyConnectedFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
...@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
} }
} }
template struct MatMulFunctor<DeviceType::OPENCL, float>; template struct MatMulFunctor<DeviceType::GPU, float>;
template struct MatMulFunctor<DeviceType::OPENCL, half>; template struct MatMulFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer, ...@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>( kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0; *(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap(); kernel_error->UnMap();
...@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels}; std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws; Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer", Tensor *buffer = ws.CreateTensor("Buffer",
GetDeviceAllocator(DeviceType::OPENCL), GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", Tensor *image = ws.CreateTensor("Image",
GetDeviceAllocator(DeviceType::OPENCL), GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
......
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template<typename T> template<typename T>
void PadFunctor<DeviceType::OPENCL, T>::operator()( void PadFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct PadFunctor<DeviceType::OPENCL, float>; struct PadFunctor<DeviceType::GPU, float>;
template template
struct PadFunctor<DeviceType::OPENCL, half>; struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
...@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::GPU, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>; template struct PoolingFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) { const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t in_height = input->dim(1); const index_t in_height = input->dim(1);
...@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>; template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>; template struct ResizeBilinearFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template<typename T> template<typename T>
void SliceFunctor<DeviceType::OPENCL, T>::operator()( void SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future) { StatsFuture *future) {
...@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct SliceFunctor<DeviceType::OPENCL, float>; struct SliceFunctor<DeviceType::GPU, float>;
template template
struct SliceFunctor<DeviceType::OPENCL, half>; struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = logits->dim(0); const index_t batch = logits->dim(0);
...@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
} }
} }
template struct SoftmaxFunctor<DeviceType::OPENCL, float>; template struct SoftmaxFunctor<DeviceType::GPU, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>; template struct SoftmaxFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -25,7 +25,7 @@ namespace mace { ...@@ -25,7 +25,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
Tensor *batch_tensor, Tensor *batch_tensor,
...@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>; template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
template struct SpaceToBatchFunctor<DeviceType::OPENCL, half>; template struct SpaceToBatchFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
} }
template <typename T> template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
...@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradTransformFunctor<DeviceType::GPU, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>; template struct WinogradTransformFunctor<DeviceType::GPU, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradInverseTransformFunctor<DeviceType::GPU, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>; template struct WinogradInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase { ...@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase {
std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); std::fill(output_ptr, output_ptr + output->size(), this->constant_value_);
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t channel = input->dim(1);
const index_t width = input->dim(2); const index_t height = input->dim(2);
const index_t channel = input->dim(3); const index_t width = input->dim(3);
#pragma omp parallel for collapse(3) #pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) { const index_t in_offset = (((b * channel + c) * height) + h) * width;
const index_t in_offset = (((b * height + h) * width) + w) * channel;
const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1)
+ (h + this->paddings_[2])) * output->dim(2) + (c + this->paddings_[2])) * output->dim(2)
+ (w + this->paddings_[4])) * output->dim(3) + (h + this->paddings_[4])) * output->dim(3)
+ this->paddings_[6]; + this->paddings_[6];
memcpy(output_ptr + out_offset, memcpy(output_ptr + out_offset,
input_ptr + in_offset, input_ptr + in_offset,
channel * sizeof(T)); width * sizeof(T));
} }
} }
} }
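Note: the rewritten CPU Pad loop above follows the NCHW layout and copies one contiguous row of `width` elements per (batch, channel, height) index instead of one element per innermost iteration. The following is a minimal, self-contained sketch (not MACE code) of the same offset arithmetic; the concrete shapes and the padding layout {batch_begin, batch_end, channel_begin, channel_end, height_begin, height_end, width_begin, width_end} are assumptions made for illustration only.

// pad_nchw_sketch.cc -- standalone illustration of the NCHW pad offsets.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical 1x2x2x3 NCHW input, padded by 1 on each side of H and W.
  const int64_t batch = 1, channel = 2, height = 2, width = 3;
  const std::vector<int> paddings = {0, 0, 0, 0, 1, 1, 1, 1};  // assumed layout
  const int64_t out_c = channel + paddings[2] + paddings[3];
  const int64_t out_h = height + paddings[4] + paddings[5];
  const int64_t out_w = width + paddings[6] + paddings[7];

  std::vector<float> input(batch * channel * height * width);
  for (size_t i = 0; i < input.size(); ++i) input[i] = static_cast<float>(i);
  // Output is pre-filled with the constant pad value (0 here).
  std::vector<float> output(batch * out_c * out_h * out_w, 0.f);

  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t c = 0; c < channel; ++c) {
      for (int64_t h = 0; h < height; ++h) {
        // One contiguous row of `width` elements is copied per (b, c, h).
        const int64_t in_offset = ((b * channel + c) * height + h) * width;
        const int64_t out_offset =
            (((b + paddings[0]) * out_c + (c + paddings[2])) * out_h +
             (h + paddings[4])) * out_w + paddings[6];
        std::memcpy(output.data() + out_offset, input.data() + in_offset,
                    width * sizeof(float));
      }
    }
  }
  std::cout << "padded elements: " << output.size() << std::endl;
  return 0;
}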
...@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase { ...@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase { struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(paddings, constant_value) {}
......
...@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_REORGANIZE_H_
#define MACE_KERNELS_REORGANIZE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ReOrganizeFunctor {
void operator()(const Tensor *input,
const std::vector<index_t> &out_shape,
Tensor *output,
StatsFuture *future) {
const bool w2c = out_shape[3] > input->dim(3);
const index_t height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_chan = input->dim(3);
const index_t output_width = output->dim(2);
const index_t output_chan = output->dim(3);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
if (w2c) {
MACE_CHECK((out_shape[3] % input->dim(3)) == 0);
const index_t multiplier = out_shape[3] / input->dim(3);
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w + (c % multiplier) * output_width;
const index_t in_chan_idx = c / multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
} else {
MACE_CHECK((input->dim(3) % out_shape[3]) == 0);
const index_t multiplier = input->dim(3) / out_shape[3];
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w % input_width;
const index_t in_chan_idx = w / input_width + c * multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
}
}
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_REORGANIZE_H_
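Note: the new ReOrganizeFunctor above trades elements between the width and channel dimensions of an NHWC tensor. Below is a standalone sketch (not MACE code) of its width-to-channel branch, traced on a hypothetical {1, 1, 4, 1} tensor reorganized into {1, 1, 2, 2}; the shapes and values are assumptions for illustration.

// reorganize_sketch.cc -- standalone trace of the width-to-channel mapping.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> in_shape = {1, 1, 4, 1};   // NHWC
  const std::vector<int64_t> out_shape = {1, 1, 2, 2};  // NHWC
  const std::vector<float> input = {0.f, 1.f, 2.f, 3.f};
  std::vector<float> output(4, 0.f);

  const int64_t multiplier = out_shape[3] / in_shape[3];  // 2
  for (int64_t n = 0; n < out_shape[0]; ++n) {
    for (int64_t h = 0; h < out_shape[1]; ++h) {
      for (int64_t w = 0; w < out_shape[2]; ++w) {
        for (int64_t c = 0; c < out_shape[3]; ++c) {
          const int64_t out_offset =
              ((n * out_shape[1] + h) * out_shape[2] + w) * out_shape[3] + c;
          // Later output channels read from later slices of the input width.
          const int64_t in_w = w + (c % multiplier) * out_shape[2];
          const int64_t in_c = c / multiplier;
          const int64_t in_offset =
              ((n * in_shape[1] + h) * in_shape[2] + in_w) * in_shape[3] + in_c;
          output[out_offset] = input[in_offset];
        }
      }
    }
  }
  // Prints 0 2 1 3: the second half of the input width becomes the
  // second output channel.
  for (float v : output) std::cout << v << " ";
  std::cout << std::endl;
  return 0;
}

The else branch of the functor handles the opposite direction, folding surplus channels back into the width dimension.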
...@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> struct ResizeBilinearFunctor<DeviceType::GPU, T>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {} : ResizeBilinearFunctorBase(size, align_corners) {}
......
...@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase { ...@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase { struct SliceFunctor<DeviceType::GPU, T> : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {} explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
......
...@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> { ...@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> { struct SoftmaxFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
......
...@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { ...@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
......
...@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { ...@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
...@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
: WinogradInverseTransformFunctorBase { : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
......
...@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ActivationOp<DeviceType::OPENCL, float>); ActivationOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ActivationOp<DeviceType::OPENCL, half>); ActivationOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -33,7 +33,7 @@ void ReluBenchmark( ...@@ -33,7 +33,7 @@ void ReluBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -45,7 +45,7 @@ void ReluBenchmark( ...@@ -45,7 +45,7 @@ void ReluBenchmark(
.Output("Output") .Output("Output")
.AddStringArg("activation", "RELU") .AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -83,8 +83,8 @@ void ReluBenchmark( ...@@ -83,8 +83,8 @@ void ReluBenchmark(
#define BM_RELU(N, C, H, W) \ #define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, float, CPU); \ BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, float, OPENCL); \ BM_RELU_MACRO(N, C, H, W, float, GPU); \
BM_RELU_MACRO(N, C, H, W, half, OPENCL); BM_RELU_MACRO(N, C, H, W, half, GPU);
BM_RELU(1, 1, 512, 512); BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128); BM_RELU(1, 3, 128, 128);
...@@ -107,7 +107,7 @@ void ReluxBenchmark( ...@@ -107,7 +107,7 @@ void ReluxBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -151,8 +151,8 @@ void ReluxBenchmark( ...@@ -151,8 +151,8 @@ void ReluxBenchmark(
#define BM_RELUX(N, C, H, W) \ #define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, float, CPU); \ BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \ BM_RELUX_MACRO(N, C, H, W, float, GPU); \
BM_RELUX_MACRO(N, C, H, W, half, OPENCL); BM_RELUX_MACRO(N, C, H, W, half, GPU);
BM_RELUX(1, 1, 512, 512); BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128); BM_RELUX(1, 3, 128, 128);
...@@ -171,7 +171,7 @@ void PreluBenchmark( ...@@ -171,7 +171,7 @@ void PreluBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -185,7 +185,7 @@ void PreluBenchmark( ...@@ -185,7 +185,7 @@ void PreluBenchmark(
.Output("Output") .Output("Output")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage", BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
...@@ -226,8 +226,8 @@ void PreluBenchmark( ...@@ -226,8 +226,8 @@ void PreluBenchmark(
#define BM_PRELU(N, C, H, W) \ #define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, float, CPU); \ BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \ BM_PRELU_MACRO(N, C, H, W, float, GPU); \
BM_PRELU_MACRO(N, C, H, W, half, OPENCL); BM_PRELU_MACRO(N, C, H, W, half, GPU);
BM_PRELU(1, 1, 512, 512); BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128); BM_PRELU(1, 3, 128, 128);
...@@ -250,7 +250,7 @@ void TanhBenchmark( ...@@ -250,7 +250,7 @@ void TanhBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -292,8 +292,8 @@ void TanhBenchmark( ...@@ -292,8 +292,8 @@ void TanhBenchmark(
#define BM_TANH(N, C, H, W) \ #define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, float, CPU); \ BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, float, OPENCL); \ BM_TANH_MACRO(N, C, H, W, float, GPU); \
BM_TANH_MACRO(N, C, H, W, half, OPENCL); BM_TANH_MACRO(N, C, H, W, half, GPU);
BM_TANH(1, 1, 512, 512); BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128); BM_TANH(1, 3, 128, 128);
...@@ -316,7 +316,7 @@ void SigmoidBenchmark( ...@@ -316,7 +316,7 @@ void SigmoidBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -359,8 +359,8 @@ void SigmoidBenchmark( ...@@ -359,8 +359,8 @@ void SigmoidBenchmark(
#define BM_SIGMOID(N, C, H, W) \ #define BM_SIGMOID(N, C, H, W) \
BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \ BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL); BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
BM_SIGMOID(1, 1, 512, 512); BM_SIGMOID(1, 1, 512, 512);
BM_SIGMOID(1, 3, 128, 128); BM_SIGMOID(1, 3, 128, 128);
......
...@@ -31,7 +31,7 @@ void TestSimpleRelu() { ...@@ -31,7 +31,7 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -68,7 +68,7 @@ void TestSimpleRelu() { ...@@ -68,7 +68,7 @@ void TestSimpleRelu() {
TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleRelu) { TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
TestSimpleRelu<DeviceType::OPENCL>(); TestSimpleRelu<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() { ...@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) { ...@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
} }
TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::OPENCL>(); TestUnalignedSimpleRelu<DeviceType::GPU>();
} }
...@@ -131,7 +131,7 @@ void TestSimpleRelux() { ...@@ -131,7 +131,7 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -170,7 +170,7 @@ void TestSimpleRelux() { ...@@ -170,7 +170,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) { TEST_F(ActivationOpTest, OPENCLSimple) {
TestSimpleRelux<DeviceType::OPENCL>(); TestSimpleRelux<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -183,7 +183,7 @@ void TestSimpleReluRelux() { ...@@ -183,7 +183,7 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) { ...@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
} }
TEST_F(ActivationOpTest, OPENCLSimpleRelux) { TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
TestSimpleReluRelux<DeviceType::OPENCL>(); TestSimpleReluRelux<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -238,7 +238,7 @@ void TestSimplePrelu() { ...@@ -238,7 +238,7 @@ void TestSimplePrelu() {
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}); net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage", BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
...@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) { ...@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) {
} }
TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::OPENCL>(); TestSimplePrelu<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -296,7 +296,7 @@ void TestSimpleTanh() { ...@@ -296,7 +296,7 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -336,7 +336,7 @@ void TestSimpleTanh() { ...@@ -336,7 +336,7 @@ void TestSimpleTanh() {
TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleTanh) { TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
TestSimpleTanh<DeviceType::OPENCL>(); TestSimpleTanh<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -349,7 +349,7 @@ void TestSimpleSigmoid() { ...@@ -349,7 +349,7 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) { ...@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
} }
TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) { TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
TestSimpleSigmoid<DeviceType::OPENCL>(); TestSimpleSigmoid<DeviceType::GPU>();
} }
} // namespace test } // namespace test
......
...@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
AddNOp<DeviceType::OPENCL, float>); AddNOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
AddNOp<DeviceType::OPENCL, half>); AddNOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c}); net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(), BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(), MakeString("InputImage", i).c_str(),
...@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
#define BM_ADDN(INPUTS, N, H, W, C) \ #define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL); BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
BM_ADDN(2, 1, 256, 256, 32); BM_ADDN(2, 1, 256, 256, 32);
BM_ADDN(2, 1, 128, 128, 32); BM_ADDN(2, 1, 128, 128, 32);
......
...@@ -64,7 +64,7 @@ void SimpleAdd3() { ...@@ -64,7 +64,7 @@ void SimpleAdd3() {
{-0.1582, 2, 3, 4, 5, 6}); {-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4; const int input_num = 4;
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
// run on gpu // run on gpu
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i), BufferToImage<D, half>(&net, MakeString("Input", i),
...@@ -105,7 +105,7 @@ void SimpleAdd3() { ...@@ -105,7 +105,7 @@ void SimpleAdd3() {
} // namespace } // namespace
TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); } TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::OPENCL>(); } TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::GPU>(); }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
...@@ -166,7 +166,7 @@ void RandomTest() { ...@@ -166,7 +166,7 @@ void RandomTest() {
} }
} // namespace } // namespace
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); } TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::GPU>(); }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BatchNormOp<DeviceType::OPENCL, float>); BatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BatchNormOp<DeviceType::OPENCL, half>); BatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -32,7 +32,7 @@ void BatchNorm( ...@@ -32,7 +32,7 @@ void BatchNorm(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -52,7 +52,7 @@ void BatchNorm( ...@@ -52,7 +52,7 @@ void BatchNorm(
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -107,8 +107,8 @@ void BatchNorm( ...@@ -107,8 +107,8 @@ void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W) \ #define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL); BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
BM_BATCH_NORM(1, 1, 512, 512); BM_BATCH_NORM(1, 1, 512, 512);
BM_BATCH_NORM(1, 3, 128, 128); BM_BATCH_NORM(1, 3, 128, 128);
......
...@@ -49,7 +49,7 @@ void Simple() { ...@@ -49,7 +49,7 @@ void Simple() {
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -90,7 +90,7 @@ void Simple() { ...@@ -90,7 +90,7 @@ void Simple() {
TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); } TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); } TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
...@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
......
...@@ -20,15 +20,15 @@ namespace ops { ...@@ -20,15 +20,15 @@ namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) { void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>); BatchToSpaceNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, half>); BatchToSpaceNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -63,7 +63,7 @@ void BMBatchToSpace( ...@@ -63,7 +63,7 @@ void BMBatchToSpace(
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL); BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
......
...@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BiasAddOp<DeviceType::OPENCL, float>); BiasAddOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BiasAddOp<DeviceType::OPENCL, half>); BiasAddOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
...@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
#define BM_BIAS_ADD(N, C, H, W) \ #define BM_BIAS_ADD(N, C, H, W) \
BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \ BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL); BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
BM_BIAS_ADD(1, 1, 512, 512); BM_BIAS_ADD(1, 1, 512, 512);
BM_BIAS_ADD(1, 3, 128, 128); BM_BIAS_ADD(1, 3, 128, 128);
......
...@@ -47,7 +47,7 @@ void BiasAddSimple() { ...@@ -47,7 +47,7 @@ void BiasAddSimple() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage", BufferToImage<D, float>(&net, "Bias", "BiasImage",
...@@ -80,7 +80,7 @@ void BiasAddSimple() { ...@@ -80,7 +80,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); } TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
BiasAddSimple<DeviceType::OPENCL>(); BiasAddSimple<DeviceType::GPU>();
} }
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
...@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
...@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
...@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
...@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
......
...@@ -19,16 +19,16 @@ namespace ops { ...@@ -19,16 +19,16 @@ namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) { void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::OPENCL, float>); BufferToImageOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::OPENCL, half>); BufferToImageOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type, ...@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgSmall) { TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) { TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
} }
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
} }
TEST(BufferToImageTest, ArgLarge) { TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1}); {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3}); {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3}); {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedia) { TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128}); {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256}); {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 3, 5}); {1, 1, 3, 5});
} }
TEST(BufferToImageTest, Filter1x1Media) { TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 13, 17}); {1, 1, 13, 17});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 128, 512}); {1, 1, 128, 512});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 3, 5}); {3, 3, 3, 5});
} }
TEST(BufferToImageTest, Filter3x3Meida) { TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 13, 17}); {3, 3, 13, 17});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 128, 256}); {3, 3, 128, 256});
} }
...@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
{11}); {11});
} }
...@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>( TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
kernels::ARGUMENT, {2}, input_data); kernels::ARGUMENT, {2}, input_data);
} }
......
...@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ChannelShuffleOp<DeviceType::OPENCL, float>); ChannelShuffleOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ChannelShuffleOp<DeviceType::OPENCL, half>); ChannelShuffleOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> { ...@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
int channels; int channels;
if (D == OPENCL) { if (D == GPU) {
channels = input->dim(3); channels = input->dim(3);
} else if (D == CPU) { } else if (D == CPU) {
channels = input->dim(1); channels = input->dim(1);
......
...@@ -31,7 +31,7 @@ void ChannelShuffle( ...@@ -31,7 +31,7 @@ void ChannelShuffle(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width}); net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -42,7 +42,7 @@ void ChannelShuffle( ...@@ -42,7 +42,7 @@ void ChannelShuffle(
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -82,8 +82,8 @@ void ChannelShuffle( ...@@ -82,8 +82,8 @@ void ChannelShuffle(
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ #define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \ BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL); BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
......
...@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<DeviceType::OPENCL, float>( net.AddInputFromArray<DeviceType::GPU, float>(
"Input", {1, 1, 2, 16}, "Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
...@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
......
...@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ConcatOp<DeviceType::OPENCL, float>); ConcatOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ConcatOp<DeviceType::OPENCL, half>); ConcatOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters, ...@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImage0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImage1", BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM") OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0") .Input("InputImage0")
...@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters, ...@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters,
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
} }
const int64_t tot = const int64_t tot =
...@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters, ...@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters,
testing::BytesProcessed(tot * sizeof(T)); testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
} }
} }
} // namespace } // namespace
......
...@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::OPENCL, float>(input_name, net.AddInputFromArray<DeviceType::GPU, float>(input_name,
shapes[i], inputs[i]); shapes[i], inputs[i]);
BufferToImage<DeviceType::OPENCL, T>(&net, input_name, image_name, BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
...@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
......
...@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Conv2dOp<DeviceType::OPENCL, float>); Conv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
Conv2dOp<DeviceType::OPENCL, half>); Conv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#define MACE_OPS_CONV_2D_H_ #define MACE_OPS_CONV_2D_H_
#include <memory> #include <memory>
#include <string>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h" #include "mace/kernels/conv_2d.h"
...@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
kernels::ActivationType::NOOP, kernels::StringToActivationType(
0.0f, OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>( static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)), "is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} ws->GetScratchBuffer(D)) {}
......
...@@ -47,7 +47,7 @@ void Conv2d(int iters, ...@@ -47,7 +47,7 @@ void Conv2d(int iters,
{output_channels, channels, kernel_h, {output_channels, channels, kernel_h,
kernel_w}); kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels, {kernel_h, kernel_w, output_channels,
...@@ -68,7 +68,7 @@ void Conv2d(int iters, ...@@ -68,7 +68,7 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation}) .AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -138,8 +138,8 @@ void Conv2d(int iters, ...@@ -138,8 +138,8 @@ void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ #define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
......
...@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) { ...@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) {
} }
TEST_F(Conv2dOpTest, OPENCLSimple) { TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>(); TestNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>(); TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
} }
namespace { namespace {
...@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) { ...@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) {
} }
TEST_F(Conv2dOpTest, OPENCLWithoutBias) { TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>(); TestNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
} }
namespace { namespace {
...@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() { ...@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) { ...@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) {
} }
TEST_F(Conv2dOpTest, OPENCLStride2) { TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>(); TestNHWCCombined3x3<DeviceType::GPU, float>();
} }
namespace {
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(Conv2dOpTest, FusedCPUSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
}
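A quick check of the expected {1, 1, 1, 1} tensor used by both fused tests: every input element is -1 and every filter weight is 1, so the single VALID 3x3 output position sums 3 * 3 * 2 = 18 products of -1, giving -18 (or -18.1 once the -0.1 bias is added); the fused RELU then clamps this negative value to 0, which is the 0.0f the tests compare against.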
namespace { namespace {
template<DeviceType D> template<DeviceType D>
void TestConv1x1() { void TestConv1x1() {
...@@ -389,7 +544,7 @@ void TestConv1x1() { ...@@ -389,7 +544,7 @@ void TestConv1x1() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage", BufferToImage<D, float>(&net, "Filter", "FilterImage",
...@@ -431,7 +586,7 @@ void TestConv1x1() { ...@@ -431,7 +586,7 @@ void TestConv1x1() {
TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); } TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); } TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
namespace { namespace {
template<DeviceType D, typename T> template<DeviceType D, typename T>
...@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1); TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2); TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3); TestComplexConvNxNS12<DeviceType::GPU, float>({31, 113, 13, 17}, 3);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 32, 13, 17}, 4);
} }
namespace { namespace {
...@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16},
{4, 4}); {4, 4});
} }
...@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) { TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2); TestDilationConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) { TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4); TestDilationConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, 4);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4); TestDilationConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, 4);
}
namespace {
template<DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
};
func(1, 1, VALID);
func(1, 1, SAME);
}
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
{2, 2});
} }
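For these atrous cases the effective kernel extent is k + (k - 1) * (d - 1): the 7x7 filter with dilation 2 covers 13x13 and the 15x15 filter with dilation 2 covers 29x29, both of which still fit inside the 32x32 and 63x71 inputs used above (note that the second test passes dilations {2, 2} even though its name carries a D4 suffix).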
namespace { namespace {
...@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedPad1) { TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1}); TestArbitraryPadConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4}); TestArbitraryPadConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, {4, 4});
} }
} // namespace test } // namespace test
......
...@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) {
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = Tensor *input =
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::OPENCL), ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
...@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) { ...@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) {
} }
std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry()); std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry());
auto net = auto net =
CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL, NetMode::INIT); CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL); net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("Output") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
......
...@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::OPENCL, float>); DepthToSpaceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::OPENCL, half>); DepthToSpaceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> { ...@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> {
int input_depth; int input_depth;
if (D == CPU) { if (D == CPU) {
input_depth = input->dim(1); input_depth = input->dim(1);
} else if (D == OPENCL) { } else if (D == GPU) {
input_depth = input->dim(3); input_depth = input->dim(3);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
......
...@@ -31,7 +31,7 @@ void DepthToSpace( ...@@ -31,7 +31,7 @@ void DepthToSpace(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -42,7 +42,7 @@ void DepthToSpace( ...@@ -42,7 +42,7 @@ void DepthToSpace(
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -82,8 +82,8 @@ void DepthToSpace( ...@@ -82,8 +82,8 @@ void DepthToSpace(
#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ #define BM_DEPTH_TO_SPACE(N, C, H, W, G) \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \ BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL); BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
......
...@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s, ...@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s,
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = CreateTensor<float>(expected_shape, expected_data);
...@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { ...@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
} }
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2,
...@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { ...@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
} }
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, {1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16}, 9, 10, 11, 12, 13, 14, 15, 16},
2, 2,
...@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { ...@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
} }
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 2, 16}, RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2,
...@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { ...@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 1, 16}, RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, {1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16}, 9, 10, 11, 12, 13, 14, 15, 16},
2, 2,
...@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { ...@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0); const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0);
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 192, 192, 128}, RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128},
in, in,
2, 2,
{1, 384, 384, 32}, {1, 384, 384, 32},
...@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size,
} // namespace } // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, float>(true, 2, {1, 192, 192, 128});
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
} }
} // namespace test } // namespace test
......
...@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>); DepthwiseConv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters, ...@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters,
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}); net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels}); {batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
...@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters, ...@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters, ...@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
......
...@@ -59,7 +59,7 @@ void SimpleValidTest() { ...@@ -59,7 +59,7 @@ void SimpleValidTest() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { ...@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
} }
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) { TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
SimpleValidTest<DeviceType::OPENCL, float>(); SimpleValidTest<DeviceType::GPU, float>();
} }
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
SimpleValidTest<DeviceType::OPENCL, half>(); SimpleValidTest<DeviceType::GPU, half>();
} }
namespace { namespace {
...@@ -184,7 +184,7 @@ void ComplexValidTest() { ...@@ -184,7 +184,7 @@ void ComplexValidTest() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) { ...@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
} }
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) { TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
ComplexValidTest<DeviceType::OPENCL, float>(); ComplexValidTest<DeviceType::GPU, float>();
} }
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::OPENCL, half>(); ComplexValidTest<DeviceType::GPU, half>();
} }
namespace { namespace {
...@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, {batch, height, width,
input_channels}); input_channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier}); "Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier {multiplier
* input_channels}); * input_channels});
...@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) {
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Filter", "FilterImage", BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage") .Input("InputImage")
...@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, ImageToBuffer<DeviceType::GPU, float>(&net,
"OutputImage", "OutputImage",
"DeviceOutput", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
......
...@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::OPENCL, float>); EltwiseOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::OPENCL, half>); EltwiseOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -34,10 +34,10 @@ void EltwiseBenchmark( ...@@ -34,10 +34,10 @@ void EltwiseBenchmark(
net.AddRandomInput<D, T>("Input0", {n, h, w, c}); net.AddRandomInput<D, T>("Input0", {n, h, w, c});
net.AddRandomInput<D, T>("Input1", {n, h, w, c}); net.AddRandomInput<D, T>("Input1", {n, h, w, c});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input0", "InputImg0", BufferToImage<D, half>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Input1", "InputImg1", BufferToImage<D, half>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
...@@ -90,8 +90,8 @@ void EltwiseBenchmark( ...@@ -90,8 +90,8 @@ void EltwiseBenchmark(
#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ #define BM_ELTWISE(ELT_TYPE, N, H, W, C) \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \ BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL); BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
BM_ELTWISE(2, 1, 128, 128, 32); BM_ELTWISE(2, 1, 128, 128, 32);
BM_ELTWISE(2, 1, 240, 240, 256); BM_ELTWISE(2, 1, 240, 240, 256);
......
...@@ -183,45 +183,45 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) { ...@@ -183,45 +183,45 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) { TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUM, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {1, 1, 1, 1}, {1}, 1,
{2}); {2});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUB, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
{0, 1, 2, 3, 4, 5}); {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
2, 2,
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::DIV, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {1, 1, 2, 3},
{2, 4, 6, 8, 10, 12}, {2, 4, 6, 8, 10, 12},
2, 2,
{1, 2, 3, 4, 5, 6}); {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MIN, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
{1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MAX, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
3, 3,
{3, 3, 3, 4, 5, 6}); {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::NEG, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
3, 3,
{-1, -2, -3, -4, -5, -6}); {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::ABS, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::ABS,
{1, 1, 2, 3}, {1, 1, 2, 3},
{-1, -2, -3, -4, -5, -6}, {-1, -2, -3, -4, -5, -6},
3, 3,
{1, 2, 3, 4, 5, 6}); {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SQR_DIFF, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
...@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) { ...@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) { TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 3}, {1, 2, 3},
{2, 4, 6, 5, 7, 9}); {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0, 5, 5, 5, 5, 5}); {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, -5, -5, -5, -5, -5}); {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, kernels::EltwiseType::PROD,
{1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 3}, {1, 2, 3},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 4, 10, 18}); {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 1, 1, 5}, {1, 1, 1, 1, 5},
{1, 2, 3, 4, 1, 6, 7, 8, 9, 2}); {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 1, 1, 5}, {1, 1, 1, 2, 4},
{1, 2, 1, 5}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 2, 1, 1, 1, 2, 4}); {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, kernels::EltwiseType::MIN,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, kernels::EltwiseType::SQR_DIFF,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
...@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) { ...@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
25}); 25});
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) { TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.2, 0.4, 0.6, 0.8, 1, 1.2},
{0.1, 0.1}); {0.1, 0.1});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0}); {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, kernels::EltwiseType::PROD,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36}); {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, kernels::EltwiseType::MIN,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, kernels::EltwiseType::SQR_DIFF,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
...@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", shape, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImg", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg") .Input("InputImg")
...@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
...@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC,
"TInput0", NCHW); "TInput0", NCHW);
...@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImg0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImg1", BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
...@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>); FoldedBatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>); FoldedBatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -60,7 +60,7 @@ void Simple() { ...@@ -60,7 +60,7 @@ void Simple() {
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -94,7 +94,7 @@ void Simple() { ...@@ -94,7 +94,7 @@ void Simple() {
TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); } TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); } TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
...@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
......
...@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::OPENCL, float>); FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::OPENCL, half>); FullyConnectedOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -43,7 +43,7 @@ void FCBenchmark( ...@@ -43,7 +43,7 @@ void FCBenchmark(
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH; kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
BufferToImage<D, T>(&net, "Weight", "WeightImage", BufferToImage<D, T>(&net, "Weight", "WeightImage",
weight_type); weight_type);
...@@ -93,8 +93,8 @@ void FCBenchmark( ...@@ -93,8 +93,8 @@ void FCBenchmark(
#define BM_FC(N, H, W, C, OC) \ #define BM_FC(N, H, W, C, OC) \
BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
BM_FC_MACRO(N, H, W, C, OC, float, OPENCL); \ BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
BM_FC_MACRO(N, H, W, C, OC, half, OPENCL); BM_FC_MACRO(N, H, W, C, OC, half, GPU);
BM_FC(1, 16, 16, 32, 32); BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000); BM_FC(1, 8, 8, 32, 1000);
......
...@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape,
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Weight", "WeightImage", BufferToImage<D, float>(&net, "Weight", "WeightImage",
...@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { ...@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
} }
TEST_F(FullyConnectedOpTest, SimpleOPENCL) { TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206}); {206});
Simple<DeviceType::OPENCL>( Simple<DeviceType::GPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10}, {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::OPENCL>( Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
...@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) { ...@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
} }
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
{1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
} }
...@@ -136,11 +136,11 @@ void Complex(const index_t batch, ...@@ -136,11 +136,11 @@ void Complex(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels}); "Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input") .Input("Input")
...@@ -159,11 +159,11 @@ void Complex(const index_t batch, ...@@ -159,11 +159,11 @@ void Complex(const index_t batch,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage", BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_HEIGHT); kernels::BufferType::WEIGHT_HEIGHT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
...@@ -176,9 +176,9 @@ void Complex(const index_t batch, ...@@ -176,9 +176,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
...@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch, ...@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels}); "Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input") .Input("Input")
...@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch, ...@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage", BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_WIDTH); kernels::BufferType::WEIGHT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
...@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch, ...@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/fused_conv_2d.h"
namespace mace {
namespace ops {
void Register_FusedConv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
} // namespace mace
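The fused Conv2D file above still registers its OpenCL kernels with DeviceType::OPENCL. A minimal sketch of the same registration after the rename applied throughout this commit, assuming FusedConv2dOp keeps its float and half GPU specializations, would read:

// Sketch only: GPU-side registration for FusedConv2D, following the same
// OpKeyBuilder pattern this commit applies to Eltwise, FoldedBatchNorm and FC.
// Assumes FusedConv2dOp<DeviceType::GPU, T> specializations exist.
#ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
                        .Device(DeviceType::GPU)
                        .TypeConstraint<float>("T")
                        .Build(),
                    FusedConv2dOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
                        .Device(DeviceType::GPU)
                        .TypeConstraint<half>("T")
                        .Build(),
                    FusedConv2dOp<DeviceType::GPU, half>);
#endif  // MACE_ENABLE_OPENCL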
(Diffs for the remaining changed files are collapsed.)