提交 3ba07e0b 编写于 作者: L liuqi

Change DeviceType::OPENCL to DeviceType::GPU and fix some bugs.

上级 d98386d6
......@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) {
Custom_Op<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Custom_Op<DeviceType::OPENCL, float>);
Custom_Op<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Custom_Op<DeviceType::OPENCL, half>);
Custom_Op<DeviceType::GPU, half>);
}
} // namespace ops
......
......@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//3. 声明设备类型
DeviceType device_type = DeviceType::OPENCL;
DeviceType device_type = DeviceType::GPU;
//4. 定义输入输出名称数组
std::vector<std::string> input_names = {...};
......@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) {
//6. 创建MaceEngine对象
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. 如果设备类型是OPENCL或HEXAGON,可以在此释放model_data
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
//7. 如果设备类型是GPU或者HEXAGON,可以在此释放model_data
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
......
......@@ -108,10 +108,8 @@ inline int64_t NowMicros() {
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("GPU") == 0) {
return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
......@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine,
return true;
}
DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]");
DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1",
......@@ -279,7 +277,7 @@ int Main(int argc, char **argv) {
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
......@@ -347,7 +345,7 @@ int Main(int argc, char **argv) {
LOG(INFO) << "Run init";
std::unique_ptr<mace::MaceEngine> engine_ptr(
new mace::MaceEngine(&net_def, device_type, input_names, output_names));
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
......
......@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("GPU") == 0) {
return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
......@@ -277,7 +275,7 @@ int Main(int argc, char **argv) {
FLAGS_gpu_model_data_file.c_str());
NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names,
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names,
output_names);
mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
......
......@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) {
}
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
......
......@@ -189,7 +189,7 @@ class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
allocator_(GetDeviceAllocator(GPU)),
buf_(nullptr),
mapped_buf_(nullptr) {}
......@@ -198,7 +198,7 @@ class Image : public BufferBase {
std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
allocator_(GetDeviceAllocator(GPU)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
......
......@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run(
input_tensors.push_back(input_tensor);
}
for (auto &output : *outputs) {
if (device_type_ == DeviceType::OPENCL) {
if (device_type_ == DeviceType::GPU) {
MACE_CHECK(output.second.shape().size() == 4,
"The outputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions");
......@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run(
#endif
#ifdef MACE_ENABLE_OPENCL
if (device_type_ == OPENCL) {
if (device_type_ == GPU) {
OpenCLRuntime::Global()->SaveBuiltCLProgram();
}
#endif
......
......@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL &&
bool future_wait = (device_type_ == DeviceType::GPU &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
......
......@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry);
extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_FusedConv2D(OperatorRegistry *op_registry);
extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry);
extern void Register_Pad(OperatorRegistry *op_registry);
......@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
extern void Register_Proposal(OperatorRegistry *op_registry);
extern void Register_PSROIAlign(OperatorRegistry *op_registry);
extern void Register_Quantize(OperatorRegistry *op_registry);
extern void Register_ReOrganize(OperatorRegistry *op_registry);
extern void Register_Requantize(OperatorRegistry *op_registry);
extern void Register_Reshape(OperatorRegistry *op_registry);
extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
......@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_Eltwise(this);
ops::Register_FoldedBatchNorm(this);
ops::Register_FullyConnected(this);
ops::Register_FusedConv2D(this);
ops::Register_LocalResponseNorm(this);
ops::Register_MatMul(this);
ops::Register_Pad(this);
......@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_PSROIAlign(this);
ops::Register_Quantize(this);
ops::Register_Requantize(this);
ops::Register_ReOrganize(this);
ops::Register_Reshape(this);
ops::Register_ResizeBilinear(this);
ops::Register_Slice(this);
......
......@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
VLOG(3) << "Model data size: " << model_data_size;
if (model_data_size > 0) {
if (type == DeviceType::CPU || type == DeviceType::NEON) {
if (type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type),
model_data_ptr,
......@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
tensor_map_[const_tensor.name()] = std::move(tensor);
}
if (type == DeviceType::CPU || type == DeviceType::OPENCL) {
if (type == DeviceType::CPU || type == DeviceType::GPU) {
CreateOutputTensorBuffer(net_def, type);
}
}
......@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
......@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
tensor->SetSourceOpName(op.name());
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
......@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
}
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
if (device_type == CPU || device_type == NEON) {
if (device_type == CPU) {
return host_scratch_buffer_.get();
} else {
return nullptr;
......
......@@ -22,7 +22,7 @@
* --input_file=input_data \
* --output_file=mace.out \
* --model_data_file=model_data.data \
* --device=OPENCL
* --device=GPU
*/
#include <malloc.h>
#include <stdint.h>
......@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("GPU") == 0) {
return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
......@@ -135,7 +133,7 @@ DEFINE_string(output_file,
DEFINE_string(model_data_file,
"",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
......@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names,
MaceStatus res = mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
......@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names,
// Init model
mace::MaceEngine engine(&net_def, device_type, input_names,
output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
......
......@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
class ActivationFunctor<DeviceType::GPU, T> {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
......
......@@ -93,7 +93,7 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> {
struct AddNFunctor<DeviceType::GPU, T> {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
......
......@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
......
......@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
struct BiasAddFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
......
......@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
};
template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
......
......@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
struct ChannelShuffleFunctor<DeviceType::GPU, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
......@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
......
......@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
......
......@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
......@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type,
......
......@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff,
const float value)
......
......@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation,
const float relux_max_limit)
......
......@@ -75,7 +75,7 @@ struct MatMulFunctor {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
struct MatMulFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
......
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
......@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
template struct ActivationFunctor<DeviceType::OPENCL, float>;
template struct ActivationFunctor<DeviceType::OPENCL, half>;
template struct ActivationFunctor<DeviceType::GPU, float>;
template struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -22,7 +22,7 @@ namespace mace {
namespace kernels {
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
void AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
......@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct AddNFunctor<DeviceType::OPENCL, float>;
template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::OPENCL, half>;
template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
......@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
template struct BatchNormFunctor<DeviceType::OPENCL, float>;
template struct BatchNormFunctor<DeviceType::OPENCL, half>;
template struct BatchNormFunctor<DeviceType::GPU, float>;
template struct BatchNormFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -22,7 +22,7 @@ namespace mace {
namespace kernels {
template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
......@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
template struct BiasAddFunctor<DeviceType::OPENCL, float>;
template struct BiasAddFunctor<DeviceType::OPENCL, half>;
template struct BiasAddFunctor<DeviceType::GPU, float>;
template struct BiasAddFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -20,7 +20,7 @@ namespace mace {
namespace kernels {
template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape;
......@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>;
template struct BufferToImageFunctor<DeviceType::GPU, float>;
template struct BufferToImageFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
......@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
}
template
struct ChannelShuffleFunctor<DeviceType::OPENCL, float>;
struct ChannelShuffleFunctor<DeviceType::GPU, float>;
template
struct ChannelShuffleFunctor<DeviceType::OPENCL, half>;
struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel,
}
template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
void ConcatFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
......@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct ConcatFunctor<DeviceType::OPENCL, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>;
template struct ConcatFunctor<DeviceType::GPU, float>;
template struct ConcatFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error);
template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
......@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
template struct Conv2dFunctor<DeviceType::OPENCL, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>;
template struct Conv2dFunctor<DeviceType::GPU, float>;
template struct Conv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
......@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, half>;
template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
template struct DepthToSpaceOpFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
}
template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
&kwg_size_, &kernel_error_);
}
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>;
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -21,7 +21,7 @@ namespace mace {
namespace kernels {
template <typename T>
void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
......@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
}
}
template struct EltwiseFunctor<DeviceType::OPENCL, float>;
template struct EltwiseFunctor<DeviceType::OPENCL, half>;
template struct EltwiseFunctor<DeviceType::GPU, float>;
template struct EltwiseFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel,
} // namespace
template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
......@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template struct FullyConnectedFunctor<DeviceType::GPU, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
template struct FullyConnectedFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -21,7 +21,7 @@ namespace mace {
namespace kernels {
template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
......@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
}
}
template struct MatMulFunctor<DeviceType::OPENCL, float>;
template struct MatMulFunctor<DeviceType::GPU, float>;
template struct MatMulFunctor<DeviceType::OPENCL, half>;
template struct MatMulFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap();
......@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer",
GetDeviceAllocator(DeviceType::OPENCL),
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape);
std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image",
GetDeviceAllocator(DeviceType::OPENCL),
GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape);
......
......@@ -21,7 +21,7 @@ namespace mace {
namespace kernels {
template<typename T>
void PadFunctor<DeviceType::OPENCL, T>::operator()(
void PadFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
......@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
}
template
struct PadFunctor<DeviceType::OPENCL, float>;
struct PadFunctor<DeviceType::GPU, float>;
template
struct PadFunctor<DeviceType::OPENCL, half>;
struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -22,7 +22,7 @@ namespace mace {
namespace kernels {
template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
......@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
template struct PoolingFunctor<DeviceType::OPENCL, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>;
template struct PoolingFunctor<DeviceType::GPU, float>;
template struct PoolingFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
......@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
template struct ResizeBilinearFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -21,7 +21,7 @@ namespace mace {
namespace kernels {
template<typename T>
void SliceFunctor<DeviceType::OPENCL, T>::operator()(
void SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
......@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
}
template
struct SliceFunctor<DeviceType::OPENCL, float>;
struct SliceFunctor<DeviceType::GPU, float>;
template
struct SliceFunctor<DeviceType::OPENCL, half>;
struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -23,7 +23,7 @@ namespace mace {
namespace kernels {
template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output,
StatsFuture *future) {
const index_t batch = logits->dim(0);
......@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
}
}
template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
template struct SoftmaxFunctor<DeviceType::GPU, float>;
template struct SoftmaxFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -25,7 +25,7 @@ namespace mace {
namespace kernels {
template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
......@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
template struct SpaceToBatchFunctor<DeviceType::OPENCL, half>;
template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
template struct SpaceToBatchFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......
......@@ -22,7 +22,7 @@ namespace mace {
namespace kernels {
template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
......@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
}
template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor,
const Tensor *bias,
Tensor *output_tensor,
......@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradTransformFunctor<DeviceType::GPU, float>;
template struct WinogradTransformFunctor<DeviceType::GPU, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradInverseTransformFunctor<DeviceType::GPU, float>;
template struct WinogradInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase {
std::fill(output_ptr, output_ptr + output->size(), this->constant_value_);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channel = input->dim(3);
const index_t channel = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
const index_t in_offset = (((b * height + h) * width) + w) * channel;
for (index_t c = 0; c < channel; ++c) {
for (index_t h = 0; h < height; ++h) {
const index_t in_offset = (((b * channel + c) * height) + h) * width;
const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1)
+ (h + this->paddings_[2])) * output->dim(2)
+ (w + this->paddings_[4])) * output->dim(3)
+ (c + this->paddings_[2])) * output->dim(2)
+ (h + this->paddings_[4])) * output->dim(3)
+ this->paddings_[6];
memcpy(output_ptr + out_offset,
input_ptr + in_offset,
channel * sizeof(T));
width * sizeof(T));
}
}
}
......@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase {
struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings,
const float constant_value)
: PadFunctorBase(paddings, constant_value) {}
......
......@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_REORGANIZE_H_
#define MACE_KERNELS_REORGANIZE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ReOrganizeFunctor {
void operator()(const Tensor *input,
const std::vector<index_t> &out_shape,
Tensor *output,
StatsFuture *future) {
const bool w2c = out_shape[3] > input->dim(3);
const index_t height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_chan = input->dim(3);
const index_t output_width = output->dim(2);
const index_t output_chan = output->dim(3);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
if (w2c) {
MACE_CHECK((out_shape[3] % input->dim(3)) == 0);
const index_t multiplier = out_shape[3] / input->dim(3);
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w + (c % multiplier) * output_width;
const index_t in_chan_idx = c / multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
} else {
MACE_CHECK((input->dim(3) % out_shape[3]) == 0);
const index_t multiplier = input->dim(3) / out_shape[3];
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w % input_width;
const index_t in_chan_idx = w / input_width + c * multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
}
}
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_REORGANIZE_H_
......@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
struct ResizeBilinearFunctor<DeviceType::GPU, T>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
......
......@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
struct SliceFunctor<DeviceType::GPU, T> : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
void operator()(const Tensor *input,
......
......@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> {
struct SoftmaxFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
......
......@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s)
......
......@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T>
struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
......@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
: WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch,
const int height,
......
......@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::OPENCL, float>);
ActivationOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::OPENCL, half>);
ActivationOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -33,7 +33,7 @@ void ReluBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -45,7 +45,7 @@ void ReluBenchmark(
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -83,8 +83,8 @@ void ReluBenchmark(
#define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
BM_RELU_MACRO(N, C, H, W, half, OPENCL);
BM_RELU_MACRO(N, C, H, W, float, GPU); \
BM_RELU_MACRO(N, C, H, W, half, GPU);
BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128);
......@@ -107,7 +107,7 @@ void ReluxBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -151,8 +151,8 @@ void ReluxBenchmark(
#define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
BM_RELUX_MACRO(N, C, H, W, float, GPU); \
BM_RELUX_MACRO(N, C, H, W, half, GPU);
BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128);
......@@ -171,7 +171,7 @@ void PreluBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -185,7 +185,7 @@ void PreluBenchmark(
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
......@@ -226,8 +226,8 @@ void PreluBenchmark(
#define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
BM_PRELU_MACRO(N, C, H, W, float, GPU); \
BM_PRELU_MACRO(N, C, H, W, half, GPU);
BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128);
......@@ -250,7 +250,7 @@ void TanhBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -292,8 +292,8 @@ void TanhBenchmark(
#define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
BM_TANH_MACRO(N, C, H, W, half, OPENCL);
BM_TANH_MACRO(N, C, H, W, float, GPU); \
BM_TANH_MACRO(N, C, H, W, half, GPU);
BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128);
......@@ -316,7 +316,7 @@ void SigmoidBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -359,8 +359,8 @@ void SigmoidBenchmark(
#define BM_SIGMOID(N, C, H, W) \
BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \
BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL);
BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
BM_SIGMOID(1, 1, 512, 512);
BM_SIGMOID(1, 3, 128, 128);
......
......@@ -31,7 +31,7 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -68,7 +68,7 @@ void TestSimpleRelu() {
TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
TestSimpleRelu<DeviceType::OPENCL>();
TestSimpleRelu<DeviceType::GPU>();
}
namespace {
......@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
}
TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::OPENCL>();
TestUnalignedSimpleRelu<DeviceType::GPU>();
}
......@@ -131,7 +131,7 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -170,7 +170,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) {
TestSimpleRelux<DeviceType::OPENCL>();
TestSimpleRelux<DeviceType::GPU>();
}
namespace {
......@@ -183,7 +183,7 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
}
TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
TestSimpleReluRelux<DeviceType::OPENCL>();
TestSimpleReluRelux<DeviceType::GPU>();
}
namespace {
......@@ -238,7 +238,7 @@ void TestSimplePrelu() {
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
......@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) {
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::OPENCL>();
TestSimplePrelu<DeviceType::GPU>();
}
namespace {
......@@ -296,7 +296,7 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -336,7 +336,7 @@ void TestSimpleTanh() {
TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
TestSimpleTanh<DeviceType::OPENCL>();
TestSimpleTanh<DeviceType::GPU>();
}
namespace {
......@@ -349,7 +349,7 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
}
TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
TestSimpleSigmoid<DeviceType::OPENCL>();
TestSimpleSigmoid<DeviceType::GPU>();
}
} // namespace test
......
......@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
AddNOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
AddNOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
......@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
#define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
BM_ADDN(2, 1, 256, 256, 32);
BM_ADDN(2, 1, 128, 128, 32);
......
......@@ -64,7 +64,7 @@ void SimpleAdd3() {
{-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4;
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
......@@ -105,7 +105,7 @@ void SimpleAdd3() {
} // namespace
TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::OPENCL>(); }
TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::GPU>(); }
namespace {
template <DeviceType D>
......@@ -166,7 +166,7 @@ void RandomTest() {
}
} // namespace
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::GPU>(); }
} // namespace test
} // namespace ops
......
......@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, float>);
BatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
BatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -32,7 +32,7 @@ void BatchNorm(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -52,7 +52,7 @@ void BatchNorm(
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -107,8 +107,8 @@ void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
BM_BATCH_NORM(1, 1, 512, 512);
BM_BATCH_NORM(1, 3, 128, 128);
......
......@@ -49,7 +49,7 @@ void Simple() {
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -90,7 +90,7 @@ void Simple() {
TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// generate random input
......@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage",
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage",
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage",
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage",
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
}
......@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage",
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage",
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage",
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage",
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
}
......
......@@ -20,15 +20,15 @@ namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>);
BatchToSpaceNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, half>);
BatchToSpaceNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -63,7 +63,7 @@ void BMBatchToSpace(
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL);
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
......
......@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::OPENCL, float>);
BiasAddOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BiasAddOp<DeviceType::OPENCL, half>);
BiasAddOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
......@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
#define BM_BIAS_ADD(N, C, H, W) \
BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL);
BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
BM_BIAS_ADD(1, 1, 512, 512);
BM_BIAS_ADD(1, 3, 128, 128);
......
......@@ -47,7 +47,7 @@ void BiasAddSimple() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
......@@ -80,7 +80,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
BiasAddSimple<DeviceType::OPENCL>();
BiasAddSimple<DeviceType::GPU>();
}
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
......@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......
......@@ -19,16 +19,16 @@ namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
BufferToImageOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, half>);
BufferToImageOp<DeviceType::GPU, half>);
}
} // namespace ops
......
......@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256});
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 3, 5});
}
TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 13, 17});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 128, 512});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 3, 5});
}
TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 13, 17});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 128, 256});
}
......@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
{11});
}
......@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
kernels::ARGUMENT, {2}, input_data);
}
......
......@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::OPENCL, float>);
ChannelShuffleOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ChannelShuffleOp<DeviceType::OPENCL, half>);
ChannelShuffleOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
int channels;
if (D == OPENCL) {
if (D == GPU) {
channels = input->dim(3);
} else if (D == CPU) {
channels = input->dim(1);
......
......@@ -31,7 +31,7 @@ void ChannelShuffle(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -42,7 +42,7 @@ void ChannelShuffle(
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -82,8 +82,8 @@ void ChannelShuffle(
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL);
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
......
......@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::OPENCL, float>(
net.AddInputFromArray<DeviceType::GPU, float>(
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
......@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
......
......@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::OPENCL, float>);
ConcatOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ConcatOp<DeviceType::OPENCL, half>);
ConcatOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1);
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImage0",
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImage1",
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0")
......@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters,
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
}
const int64_t tot =
......@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters,
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
}
}
} // namespace
......
......@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::OPENCL, float>(input_name,
net.AddInputFromArray<DeviceType::GPU, float>(input_name,
shapes[i], inputs[i]);
BufferToImage<DeviceType::OPENCL, T>(&net, input_name, image_name,
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL);
}
......@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
......
......@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
Conv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
Conv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -16,6 +16,7 @@
#define MACE_OPS_CONV_2D_H_
#include <memory>
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h"
......@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->padding_type_,
this->paddings_,
this->dilations_.data(),
kernels::ActivationType::NOOP,
0.0f,
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {}
......
......@@ -47,7 +47,7 @@ void Conv2d(int iters,
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels,
......@@ -68,7 +68,7 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -138,8 +138,8 @@ void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
......
......@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) {
}
TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
TestNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
}
namespace {
......@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) {
}
TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
TestNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
}
namespace {
......@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) {
}
TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>();
TestNHWCCombined3x3<DeviceType::GPU, float>();
}
namespace {
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(Conv2dOpTest, FusedCPUSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
}
namespace {
template<DeviceType D>
void TestConv1x1() {
......@@ -389,7 +544,7 @@ void TestConv1x1() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage",
......@@ -431,7 +586,7 @@ void TestConv1x1() {
TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
namespace {
template<DeviceType D, typename T>
......@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2);
TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1);
TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 2);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2);
TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1);
TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 2);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
TestComplexConvNxNS12<DeviceType::GPU, float>({31, 113, 13, 17}, 3);
TestComplexConvNxNS12<DeviceType::GPU, float>({32, 32, 13, 17}, 4);
}
namespace {
......@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64},
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2},
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2},
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64},
TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7},
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7},
TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16},
{4, 4});
}
......@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
TestDilationConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, 2);
}
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
TestDilationConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, 4);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
TestDilationConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, 4);
}
namespace {
template<DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
};
func(1, 1, VALID);
func(1, 1, SAME);
}
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
{2, 2});
}
namespace {
......@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
} // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
TestArbitraryPadConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
TestArbitraryPadConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, {4, 4});
}
} // namespace test
......
......@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) {
.Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input =
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::OPENCL),
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3});
{
......@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) {
}
std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry());
auto net =
CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL, NetMode::INIT);
CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT);
net->Run();
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL);
net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU);
net->Run();
EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
......
......@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthToSpaceOp<DeviceType::OPENCL, float>);
DepthToSpaceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthToSpaceOp<DeviceType::OPENCL, half>);
DepthToSpaceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> {
int input_depth;
if (D == CPU) {
input_depth = input->dim(1);
} else if (D == OPENCL) {
} else if (D == GPU) {
input_depth = input->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -31,7 +31,7 @@ void DepthToSpace(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -42,7 +42,7 @@ void DepthToSpace(
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -82,8 +82,8 @@ void DepthToSpace(
#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL);
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
......
......@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s,
}
if (D == DeviceType::OPENCL) {
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = CreateTensor<float>(expected_shape, expected_data);
......@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 4, 4},
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2,
......@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 2, 4},
RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16},
2,
......@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
}
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 2, 16},
RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2,
......@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 1, 16},
RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16},
2,
......@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0);
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 192, 192, 128},
RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128},
in,
2,
{1, 384, 384, 32},
......@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size,
} // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128});
RandomTest<DeviceType::GPU, float>(true, 2, {1, 192, 192, 128});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(true, 2, {1, 192, 192, 128});
RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(false, 2, {1, 384, 384, 32});
RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(false, 2, {1, 384, 384, 32});
RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
}
} // namespace test
......
......@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>);
DepthwiseConv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
DepthwiseConv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters,
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
......@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
......
......@@ -59,7 +59,7 @@ void SimpleValidTest() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
}
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
SimpleValidTest<DeviceType::OPENCL, float>();
SimpleValidTest<DeviceType::GPU, float>();
}
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
SimpleValidTest<DeviceType::OPENCL, half>();
SimpleValidTest<DeviceType::GPU, half>();
}
namespace {
......@@ -184,7 +184,7 @@ void ComplexValidTest() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
}
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
ComplexValidTest<DeviceType::OPENCL, float>();
ComplexValidTest<DeviceType::GPU, float>();
}
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::OPENCL, half>();
ComplexValidTest<DeviceType::GPU, half>();
}
namespace {
......@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input",
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias",
net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier
* input_channels});
......@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) {
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Filter", "FilterImage",
BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
......@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net,
ImageToBuffer<DeviceType::GPU, float>(&net,
"OutputImage",
"DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
......
......@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::OPENCL, float>);
EltwiseOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
EltwiseOp<DeviceType::OPENCL, half>);
EltwiseOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -34,11 +34,11 @@ void EltwiseBenchmark(
net.AddRandomInput<D, T>("Input0", {n, h, w, c});
net.AddRandomInput<D, T>("Input1", {n, h, w, c});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
if (D == DeviceType::GPU) {
BufferToImage<D, half>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
.Input("InputImg1")
......@@ -90,8 +90,8 @@ void EltwiseBenchmark(
#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL);
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
BM_ELTWISE(2, 1, 128, 128, 32);
BM_ELTWISE(2, 1, 240, 240, 256);
......
......@@ -183,49 +183,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
}
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1,
{2});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
2,
{2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3},
{2, 4, 6, 8, 10, 12},
2,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
3,
{3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
3,
{-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::ABS,
{1, 1, 2, 3},
{-1, -2, -3, -4, -5, -6},
3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{0, 1, 4, 9, 16, 25});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1,
{2});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
2,
{2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3},
{2, 4, 6, 8, 10, 12},
2,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
3,
{3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
3,
{-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::ABS,
{1, 1, 2, 3},
{-1, -2, -3, -4, -5, -6},
3,
{1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
1,
{0, 1, 4, 9, 16, 25});
}
TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
......@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
}
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 3}, {1, 2, 3},
{2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD,
{1, 1, 1, 3}, {1, 2, 3},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5},
{1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV,
{1, 1, 1, 5}, {1, 1, 1, 2, 4},
{1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN,
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF,
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
......@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{0.2, 0.4, 0.6, 0.8, 1, 1.2},
{0.1, 0.1});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>(
SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
......@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", shape, true, true);
net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImg",
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg")
......@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2);
}
}
......@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1, true, true);
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC,
"TInput0", NCHW);
......@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImg0",
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImg1",
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
......@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2);
}
}
......@@ -609,19 +609,19 @@ TEST_F(EltwiseOpTest, RandomTensorTensorFloat) {
TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
RandomTensorEltwise<half>(kernels::EltwiseType::SUM,
{1, 32, 32, 16}, {1, 32, 32, 16});
{1, 32, 32, 16}, {1, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::SUB,
{3, 32, 32, 16}, {3, 32, 32, 16});
{3, 32, 32, 16}, {3, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::PROD,
{1, 31, 37, 17}, {1, 31, 37, 17});
{1, 31, 37, 17}, {1, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::DIV,
{5, 31, 37, 17}, {5, 31, 37, 17});
{5, 31, 37, 17}, {5, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::MIN,
{1, 32, 32, 16}, {1, 32, 32, 16});
{1, 32, 32, 16}, {1, 32, 32, 16});
RandomTensorEltwise<half>(kernels::EltwiseType::MAX,
{3, 31, 37, 17}, {3, 31, 37, 17});
{3, 31, 37, 17}, {3, 31, 37, 17});
RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF,
{3, 31, 37, 17}, {3, 31, 37, 17});
{3, 31, 37, 17}, {3, 31, 37, 17});
}
......
......@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>);
FoldedBatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
FoldedBatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -60,7 +60,7 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -94,7 +94,7 @@ void Simple() {
TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// generate random input
......@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
......@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
......@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
}
......@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
......@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
}
......@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
......@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
......@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
}
......
......@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::OPENCL, float>);
FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FullyConnectedOp<DeviceType::OPENCL, half>);
FullyConnectedOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -43,7 +43,7 @@ void FCBenchmark(
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
BufferToImage<D, T>(&net, "Weight", "WeightImage",
weight_type);
......@@ -93,8 +93,8 @@ void FCBenchmark(
#define BM_FC(N, H, W, C, OC) \
BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
BM_FC_MACRO(N, H, W, C, OC, float, OPENCL); \
BM_FC_MACRO(N, H, W, C, OC, half, OPENCL);
BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
BM_FC_MACRO(N, H, W, C, OC, half, GPU);
BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000);
......
......@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Weight", "WeightImage",
......@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
}
TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
Simple<DeviceType::OPENCL>(
Simple<DeviceType::GPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::OPENCL>(
Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
......@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
}
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
{1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
......@@ -136,11 +136,11 @@ void Complex(const index_t batch,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
......@@ -159,11 +159,11 @@ void Complex(const index_t batch,
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_HEIGHT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
......@@ -176,9 +176,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch,
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
......@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch,
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
......@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/fused_conv_2d.h"
namespace mace {
namespace ops {
void Register_FusedConv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_FUSED_CONV_2D_H_
#define MACE_OPS_FUSED_CONV_2D_H_
#include <memory>
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
public:
FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(),
this->padding_type_,
this->paddings_,
this->dilations_.data(),
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output, future);
return true;
}
private:
kernels::Conv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "mace/ops/fused_conv_2d.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class FusedConv2dOpTest : public OpsTestBase {};
namespace {
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(FusedConv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
namespace {
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
namespace {
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {1, 1, 2, 5},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<float>(
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
namespace {
template<DeviceType D, typename T>
void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
// generate random input
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand_r(&seed) % 10);
index_t output_channels = shape[3] + (rand_r(&seed) % 10);
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
} // namespace
TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
namespace {
template<DeviceType D>
void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape,
const int kernel, const int stride,
Padding type) {
testing::internal::LogToStderr();
// generate random input
srand(time(NULL));
index_t batch = 3;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
OpsTestNet net;
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels},
&float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData(
{kernel, kernel, output_channels, input_channels},
&float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data
net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel, kernel, output_channels, input_channels},
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
}
} // namespace
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 1, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 1, SAME);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 2, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 2, SAME);
}
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 1, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 1, SAME);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 2, VALID);
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 2, SAME);
}
namespace {
template<DeviceType D, typename T>
void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
}
}
} // namespace
TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) {
TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 64, 3});
}
TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 64, 32});
}
namespace {
template<DeviceType D, typename T>
void TestAtrousConvNxN(const std::vector<index_t> &shape,
const int dilation) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t output_channels = shape[2];
index_t input_channels = shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
};
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
} // namespace
TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
}
TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
}
TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
}
namespace {
template<DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
};
func(1, 1, VALID);
func(1, 1, SAME);
}
} // namespace
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 16, 3},
{2, 2});
}
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
{2, 2});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -19,16 +19,16 @@ namespace ops {
void Register_ImageToBuffer(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, float>);
ImageToBufferOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, half>);
ImageToBufferOp<DeviceType::GPU, half>);
}
} // namespace ops
......
......@@ -26,16 +26,16 @@ void Register_MatMul(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
MatMulOp<DeviceType::OPENCL, float>);
MatMulOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
MatMulOp<DeviceType::OPENCL, half>);
MatMulOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -34,7 +34,7 @@ void MatMulBenchmark(
net.AddRandomInput<D, float>("A", {batch, height, channels, 1});
net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
......@@ -79,8 +79,8 @@ void MatMulBenchmark(
#define BM_MATMUL(N, H, C, W) \
BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
BM_MATMUL_MACRO(N, H, C, W, half, GPU);
BM_MATMUL(16, 32, 128, 49);
BM_MATMUL(16, 32, 128, 961);
......
......@@ -37,7 +37,7 @@ void Simple(const std::vector<index_t> &A_shape,
net.AddInputFromArray<D, float>("A", A_shape, A_value);
net.AddInputFromArray<D, float>("B", B_shape, B_value);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, float>(&net, "B", "BImage",
......@@ -91,10 +91,10 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
}
TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
{22, 28, 49, 64});
Simple<DeviceType::OPENCL>(
Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
......@@ -127,9 +127,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("A",
net.AddRandomInput<DeviceType::GPU, float>("A",
{batch, height, channels, 1});
net.AddRandomInput<DeviceType::OPENCL, float>(
net.AddRandomInput<DeviceType::GPU, float>(
"B", {batch, channels, out_width, 1});
// run cpu
......@@ -140,9 +140,9 @@ void Complex(const index_t batch,
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "A", "AImage",
BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(&net, "B", "BImage",
BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulTest")
......@@ -153,9 +153,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......
......@@ -403,7 +403,7 @@ class OpsTestNet {
}
void Sync() {
if (net_ && device_ == DeviceType::OPENCL) {
if (net_ && device_ == DeviceType::GPU) {
OpenCLRuntime::Global()->command_queue().finish();
}
}
......
......@@ -26,15 +26,15 @@ void Register_Pad(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PadOp<DeviceType::OPENCL, float>);
PadOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PadOp<DeviceType::OPENCL, half>);
PadOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -33,7 +33,7 @@ void Pad(int iters, int batch, int height,
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
const std::vector<int> paddings = {0, 0, pad, pad, pad, pad, 0, 0};
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pad", "PadTest")
......@@ -77,8 +77,8 @@ void Pad(int iters, int batch, int height,
#define BM_PAD(N, H, W, C, PAD) \
BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
BM_PAD_MACRO(N, H, W, C, PAD, float, OPENCL); \
BM_PAD_MACRO(N, H, W, C, PAD, half, OPENCL);
BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
BM_PAD(1, 512, 512, 1, 2);
BM_PAD(1, 112, 112, 64, 1);
......
......@@ -29,7 +29,7 @@ void Simple() {
// Add input data
net.AddRepeatedInput<D, float>("Input", {1, 2, 3, 1}, 2);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pad", "PadTest")
......@@ -45,15 +45,24 @@ void Simple() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("Input")
.Output("Output")
.AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0})
.Input("TInput")
.Output("TOutput")
.AddIntsArg("paddings", {0, 0, 0, 0, 1, 2, 1, 2})
.AddFloatArg("constant_value", 1.0)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
NHWC);
}
auto output = net.GetTensor("Output");
......@@ -75,7 +84,7 @@ TEST_F(PadTest, SimpleCPU) {
}
TEST_F(PadTest, SimpleGPU) {
Simple<DeviceType::OPENCL>();
Simple<DeviceType::GPU>();
}
TEST_F(PadTest, ComplexCPU) {
......@@ -84,15 +93,23 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data
net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("Input")
.Output("Output")
.Input("TInput")
.Output("TOutput")
.AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1})
.AddFloatArg("constant_value", 1.0)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
NHWC);
auto output = net.GetTensor("Output");
......@@ -109,39 +126,48 @@ TEST_F(PadTest, ComplexCPU) {
namespace {
template <typename T>
void Complex(const std::vector<index_t> &input_shape,
const std::vector<int> &paddings) {
const std::vector<int> &cpu_paddings,
const std::vector<int> &gpu_paddings) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", input_shape);
net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"TInput",
NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("Input")
.Output("Output")
.AddIntsArg("paddings", paddings)
.Input("TInput")
.Output("TOutput")
.AddIntsArg("paddings", cpu_paddings)
.AddFloatArg("constant_value", 1.0)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
NCHW,
"Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pad", "PadTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("paddings", paddings)
.AddIntsArg("paddings", gpu_paddings)
.AddFloatArg("constant_value", 1.0)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OpenCLOutput",
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
auto output = net.GetTensor("OpenCLOutput");
......@@ -155,15 +181,21 @@ void Complex(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(PadTest, ComplexFloat) {
Complex<float>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<float>({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<float>({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0});
Complex<float>({1, 32, 32, 4},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<float>({1, 31, 37, 16},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<float>({1, 128, 128, 32},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
}
TEST_F(PadTest, ComplexHalf) {
Complex<half>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<half>({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<half>({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0});
Complex<half>({1, 32, 32, 4},
{0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
Complex<half>({1, 31, 37, 16},
{0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
Complex<half>({1, 128, 128, 32},
{0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
}
} // namespace test
......
......@@ -26,16 +26,16 @@ void Register_Pooling(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
PoolingOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
PoolingOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -41,7 +41,7 @@ void Pooling(int iters,
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, height, width, channels});
} else {
......@@ -58,7 +58,7 @@ void Pooling(int iters,
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -104,7 +104,7 @@ void Pooling(int iters,
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, OPENCL);
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
......
......@@ -211,7 +211,7 @@ void SimpleMaxPooling3S2() {
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
......@@ -238,7 +238,7 @@ void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>();
SimpleMaxPooling3S2<GPU>();
}
namespace {
......@@ -304,24 +304,24 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, AVG_VALID) {
......@@ -400,7 +400,7 @@ void SimpleAvgPoolingTest() {
} // namespace
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>();
SimpleAvgPoolingTest<GPU>();
}
namespace {
......@@ -468,43 +468,43 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
} // namespace
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4},
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4},
Padding::VALID);
AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::VALID);
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::SAME);
}
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/reorganize.h"
namespace mace {
namespace ops {
void Register_ReOrganize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReOrganize")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ReOrganizeOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REORGANIZE_H_
#define MACE_OPS_REORGANIZE_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/reorganize.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ReOrganizeOp : public Operator<D, T> {
public:
ReOrganizeOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const index_t num_dims = shape_.size();
int unknown_idx = -1;
index_t product = 1;
std::vector<index_t> out_shape;
for (int i = 0; i < num_dims; ++i) {
if (shape_[i] == -1) {
MACE_CHECK(unknown_idx == -1) << "Only one input size may be -1";
unknown_idx = i;
out_shape.push_back(1);
} else {
MACE_CHECK(shape_[i] >= 0) << "Shape must be non-negative: "
<< shape_[i];
out_shape.push_back(shape_[i]);
product *= shape_[i];
}
}
if (unknown_idx != -1) {
MACE_CHECK(product != 0)
<< "Cannot infer shape if there is zero shape size.";
const index_t missing = input->size() / product;
MACE_CHECK(missing * product == input->size())
<< "Input size not match reshaped tensor size";
out_shape[unknown_idx] = missing;
}
Tensor *output = this->Output(OUTPUT);
output->Resize(out_shape);
functor_(input, out_shape, output, future);
return true;
}
private:
std::vector<int64_t> shape_;
kernels::ReOrganizeFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REORGANIZE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gmock/gmock.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class ReOrganizeTest : public OpsTestBase {};
namespace {
void TestReOrganize(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const std::vector<index_t> &output_shape,
const std::vector<float> &output_data) {
const std::vector<int> out_shape(output_shape.begin(), output_shape.end());
// Construct graph
OpsTestNet net;
OpDefBuilder("ReOrganize", "ReOrganizeTest")
.Input("Input")
.Output("Output")
.AddIntsArg("shape", out_shape)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input",
input_shape, input_data);
// Run
net.RunOp();
auto output = net.GetTensor("Output");
EXPECT_THAT(output->shape(), ::testing::ContainerEq(output_shape));
const float *output_ptr = output->data<float>();
int size = output->size();
for (int i = 0; i < size; ++i) {
ASSERT_EQ(output_data[i], output_ptr[i]) << "With Index " << i;
}
// Reverse reorganzie
const std::vector<int> in_shape(input_shape.begin(), input_shape.end());
OpDefBuilder("ReOrganize", "ReOrganizeTest")
.Input("Input")
.Output("Output")
.AddIntsArg("shape", in_shape)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input",
output_shape, output_data);
// Run
net.RunOp();
output = net.GetTensor("Output");
EXPECT_THAT(output->shape(), ::testing::ContainerEq(input_shape));
output_ptr = output->data<float>();
size = output->size();
for (int i = 0; i < size; ++i) {
ASSERT_EQ(input_data[i], output_ptr[i]) << "With Index " << i;
}
}
} // namespace
TEST_F(ReOrganizeTest, Simple) {
TestReOrganize({1, 1, 4, 6},
{0, 4, 8, 12, 16, 20,
1, 5, 9, 13, 17, 21,
2, 6, 10, 14, 18, 22,
3, 7, 11, 15, 19, 23},
{1, 1, 8, 3},
{0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19,
4, 12, 20, 5, 13, 21, 6, 14, 22, 7, 15, 23});
TestReOrganize({1, 1, 5, 6},
{0, 5, 10, 15, 20, 25,
1, 6, 11, 16, 21, 26,
2, 7, 12, 17, 22, 27,
3, 8, 13, 18, 23, 28,
4, 9, 14, 19, 24, 29},
{1, 1, 10, 3},
{0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23,
4, 14, 24, 5, 15, 25, 6, 16, 26, 7, 17, 27,
8, 18, 28, 9, 19, 29});
}
TEST_F(ReOrganizeTest, Complex) {
TestReOrganize({1, 2, 2, 6},
{0, 4, 8, 12, 16, 20,
1, 5, 9, 13, 17, 21,
2, 6, 10, 14, 18, 22,
3, 7, 11, 15, 19, 23},
{1, 2, 6, 2},
{0, 12, 1, 13, 4, 16, 5, 17, 8, 20, 9, 21,
2, 14, 3, 15, 6, 18, 7, 19, 10, 22, 11, 23});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -26,16 +26,16 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, float>);
ResizeBilinearOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, half>);
ResizeBilinearOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -38,7 +38,7 @@ void ResizeBilinearBenchmark(int iters,
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height, input_width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
} else {
......@@ -55,7 +55,7 @@ void ResizeBilinearBenchmark(int iters,
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
......@@ -99,8 +99,8 @@ void ResizeBilinearBenchmark(int iters,
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, OPENCL); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, OPENCL);
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
......
......@@ -132,7 +132,7 @@ void TestRandomResizeBilinear() {
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -147,8 +147,6 @@ void TestRandomResizeBilinear() {
ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
// TODO(someone): support NEON
}
// Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"),
......@@ -158,7 +156,7 @@ void TestRandomResizeBilinear() {
} // namespace
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::OPENCL>();
TestRandomResizeBilinear<DeviceType::GPU>();
}
} // namespace test
......
......@@ -26,16 +26,16 @@ void Register_Slice(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::OPENCL, float>);
SliceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::OPENCL, half>);
SliceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -38,7 +38,7 @@ void BMSliceHelper(int iters,
GenerateRandomRealTypeData(input_shape, &input_data);
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -85,8 +85,8 @@ void BMSliceHelper(int iters,
#define BM_SLICE(N, H, W, C, NO) \
BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
BM_SLICE_MACRO(N, H, W, C, NO, float, OPENCL); \
BM_SLICE_MACRO(N, H, W, C, NO, half, OPENCL);
BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \
BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
BM_SLICE(1, 32, 32, 32, 2);
BM_SLICE(1, 32, 32, 128, 2);
......
......@@ -51,7 +51,7 @@ void RandomTest(const int num_outputs, const int axis) {
GenerateRandomRealTypeData(input_shape, &input_data);
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -75,7 +75,7 @@ void RandomTest(const int num_outputs, const int axis) {
// Run
net.RunOp(D);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::GPU) {
for (int i = 0; i < num_outputs; ++i) {
ImageToBuffer<D, float>(&net,
MakeString("OutputImage", i),
......@@ -130,15 +130,15 @@ TEST_F(SliceOpTest, CPUAxis1) {
}
TEST_F(SliceOpTest, OPENCLFloat) {
RandomTest<DeviceType::OPENCL, float>(2, 3);
RandomTest<DeviceType::OPENCL, float>(4, 3);
RandomTest<DeviceType::OPENCL, float>(11, 3);
RandomTest<DeviceType::GPU, float>(2, 3);
RandomTest<DeviceType::GPU, float>(4, 3);
RandomTest<DeviceType::GPU, float>(11, 3);
}
TEST_F(SliceOpTest, OPENCLHalf) {
RandomTest<DeviceType::OPENCL, half>(2, 3);
RandomTest<DeviceType::OPENCL, half>(4, 3);
RandomTest<DeviceType::OPENCL, half>(11, 3);
RandomTest<DeviceType::GPU, half>(2, 3);
RandomTest<DeviceType::GPU, half>(4, 3);
RandomTest<DeviceType::GPU, half>(11, 3);
}
} // namespace test
......
......@@ -26,16 +26,16 @@ void Register_Softmax(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, float>);
SoftmaxOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, half>);
SoftmaxOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -33,7 +33,7 @@ void SoftmaxBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -44,7 +44,7 @@ void SoftmaxBenchmark(
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -82,8 +82,8 @@ void SoftmaxBenchmark(
#define BM_SOFTMAX(N, C, H, W) \
BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL); \
BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL);
BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
BM_SOFTMAX(1, 2, 512, 512);
BM_SOFTMAX(1, 3, 512, 512);
......
......@@ -40,7 +40,7 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -68,7 +68,7 @@ void Simple() {
} // namespace
TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); }
namespace {
template<DeviceType D>
......@@ -114,18 +114,18 @@ void Complex(const std::vector<index_t> &logits_shape) {
} // namespace
TEST_F(SoftmaxOpTest, OPENCLAligned) {
Complex<DeviceType::OPENCL>({1, 256, 256, 3});
Complex<DeviceType::OPENCL>({1, 128, 128, 16});
Complex<DeviceType::GPU>({1, 256, 256, 3});
Complex<DeviceType::GPU>({1, 128, 128, 16});
}
TEST_F(SoftmaxOpTest, OPENCLMulBatchAligned) {
Complex<DeviceType::OPENCL>({5, 64, 64, 3});
Complex<DeviceType::OPENCL>({8, 128, 128, 8});
Complex<DeviceType::GPU>({5, 64, 64, 3});
Complex<DeviceType::GPU>({8, 128, 128, 8});
}
TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
Complex<DeviceType::OPENCL>({1, 113, 107, 13});
Complex<DeviceType::OPENCL>({5, 211, 107, 1});
Complex<DeviceType::GPU>({1, 113, 107, 13});
Complex<DeviceType::GPU>({5, 211, 107, 1});
}
} // namespace test
......
......@@ -20,16 +20,16 @@ namespace ops {
void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, float>);
SpaceToBatchNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, half>);
SpaceToBatchNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -65,7 +65,7 @@ void BMSpaceToBatch(
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, OPENCL);
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);
BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
......
......@@ -85,7 +85,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const std::vector<index_t> &batch_shape,
const std::vector<float> &batch_data) {
auto space_tensor = std::unique_ptr<Tensor>(new Tensor(
GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
space_tensor->Resize(space_shape);
{
Tensor::MappingGuard space_mapper(space_tensor.get());
......@@ -97,7 +97,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
}
auto batch_tensor = std::unique_ptr<Tensor>(new Tensor(
GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
batch_tensor->Resize(batch_shape);
{
Tensor::MappingGuard batch_mapper(batch_tensor.get());
......@@ -106,10 +106,10 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T));
}
RunSpaceToBatch<DeviceType::OPENCL>(space_shape, space_data, block_data,
RunSpaceToBatch<DeviceType::GPU>(space_shape, space_data, block_data,
padding_data, batch_tensor.get());
RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data, block_data,
RunBatchToSpace<DeviceType::GPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
}
} // namespace
......
......@@ -26,16 +26,16 @@ void Register_SpaceToDepth(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToDepthOp<DeviceType::OPENCL, float>);
SpaceToDepthOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToDepthOp<DeviceType::OPENCL, half>);
SpaceToDepthOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -45,7 +45,7 @@ class SpaceToDepthOp : public Operator<D, T> {
input_height = input->dim(2);
input_width = input->dim(3);
input_depth = input->dim(1);
} else if (D == OPENCL) {
} else if (D == GPU) {
input_height = input->dim(1);
input_width = input->dim(2);
input_depth = input->dim(3);
......
......@@ -31,7 +31,7 @@ void SpaceToDepth(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
......@@ -42,7 +42,7 @@ void SpaceToDepth(
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -82,8 +82,8 @@ void SpaceToDepth(
#define BM_SPACE_TO_DEPTH(N, C, H, W, G) \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, OPENCL); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, OPENCL);
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
......
......@@ -147,23 +147,23 @@ void WinogradConvolution(const index_t batch,
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::SAME);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME);
}
TEST_F(WinogradConvlutionTest, BatchConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME);
}
......
......@@ -20,16 +20,16 @@ namespace ops {
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::OPENCL, float>);
WinogradInverseTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::OPENCL, half>);
WinogradInverseTransformOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -20,16 +20,16 @@ namespace ops {
void Register_WinogradTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradTransformOp<DeviceType::OPENCL, float>);
WinogradTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::OPENCL)
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradTransformOp<DeviceType::OPENCL, half>);
WinogradTransformOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -62,7 +62,7 @@ void BMWinogradTransform(
BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
......@@ -116,7 +116,7 @@ void BMWinogradInverseTransform(
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
......
......@@ -9,8 +9,7 @@ enum NetMode {
enum DeviceType {
CPU = 0; // In default, we will use CPU.
NEON = 1;
OPENCL = 2;
GPU = 2;
}
enum DataType {
......
......@@ -28,7 +28,7 @@ namespace mace {
const char *MaceVersion();
enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
......
......@@ -475,8 +475,6 @@ class CaffeConverter(object):
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise:
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
......@@ -984,15 +982,10 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
if self.device == 'cpu':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
op_def = self.CommonConvert(op, 'Reshape')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'cpu':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
......
......@@ -508,8 +508,6 @@ class TFConverter(object):
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
if op_def.type == "Conv2D":
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
......@@ -958,14 +956,17 @@ class TFConverter(object):
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
op_def.type = conv_op.type
self.transpose_filter_tensor[get_input_tensor(conv_op,
1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
self.transpose_filter_tensor[
get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2)
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(
get_input_tensor(conv_op, 1).name, "CONV2D_FILTER")
op_def.input.extend([output_name])
else:
self.transpose_filter_tensor[
get_input_tensor(conv_op, 1).name] = (3, 2, 0, 1)
op_def.input.extend([get_input_tensor(op, 0).name])
op_def.input.extend([get_input_tensor(conv_op, 1).name])
......@@ -1020,7 +1021,6 @@ class TFConverter(object):
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'activation'
fused_relu_arg.s = "RELU"
......@@ -1092,8 +1092,12 @@ class TFConverter(object):
op_def.output.extend([output.name for output in op.outputs])
paddings_arg = op_def.arg.add()
paddings_arg.name = 'paddings'
paddings_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
if self.device == 'gpu':
paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
else:
paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
paddings_value = paddings_value[[0, 3, 1, 2]]
paddings_arg.ints.extend(paddings_value.flat)
self.unused_tensor.add(get_input_tensor(op, 1).name)
if len(op.inputs) == 3:
constant_value_arg = op_def.arg.add()
......
......@@ -248,7 +248,7 @@ void MaceRun(const int in_out_size,
std::string filter_tensor_name = "filter";
std::string filter_tensor_img_name = filter_tensor_name + "_image";
const DeviceType device = DeviceType::OPENCL;
const DeviceType device = DeviceType::GPU;
NetDef net_def;
......@@ -300,7 +300,7 @@ void MaceRun(const int in_out_size,
}
}
CheckOutputs<DeviceType::OPENCL, T>(net_def, inputs, outputs);
CheckOutputs<DeviceType::GPU, T>(net_def, inputs, outputs);
}
} // namespace
......
......@@ -22,7 +22,7 @@
* --input_file=input_data \
* --output_file=mace.out \
* --model_data_file=model_data.data \
* --device=OPENCL
* --device=GPU
*/
#include <malloc.h>
#include <stdint.h>
......@@ -108,10 +108,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("GPU") == 0) {
return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
......@@ -203,7 +201,7 @@ DEFINE_string(output_file,
DEFINE_string(model_data_file,
"",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
......@@ -234,7 +232,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
......@@ -252,7 +250,7 @@ bool RunModel(const std::vector<std::string> &input_names,
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
int64_t t2 = NowMicros();
......@@ -329,7 +327,7 @@ bool RunModel(const std::vector<std::string> &input_names,
mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::OPENCL) {
if (device_type == DeviceType::GPU) {
WriteOpenCLPlatformInfo(kernel_file_path);
}
#endif // MACE_ENABLE_OPENCL
......
......@@ -69,7 +69,7 @@ def get_data_and_device_type(runtime):
device_type = "HEXAGON"
elif runtime == "gpu":
data_type = "DT_HALF"
device_type = "OPENCL"
device_type = "GPU"
elif runtime == "cpu":
data_type = "DT_FLOAT"
device_type = "CPU"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册