diff --git a/docs/developer/adding_a_new_op.md b/docs/developer/adding_a_new_op.md index 4fe30582ef966dcddb83fa4eab9fb19052af0c44..8488990375c0b1ce48a75081daf56cd04a5f6815 100644 --- a/docs/developer/adding_a_new_op.md +++ b/docs/developer/adding_a_new_op.md @@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) { Custom_Op); REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - Custom_Op); + Custom_Op); REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - Custom_Op); + Custom_Op); } } // namespace ops diff --git a/docs/user/introduction.md b/docs/user/introduction.md index 3ed1512f925bce526c9bc0255142bae2e9a19bba..963386612dfc16d8ed172ca31609ee167b5e7624 100644 --- a/docs/user/introduction.md +++ b/docs/user/introduction.md @@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); //3. 声明设备类型 -DeviceType device_type = DeviceType::OPENCL; +DeviceType device_type = DeviceType::GPU; //4. 定义输入输出名称数组 std::vector input_names = {...}; @@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) { //6. 创建MaceEngine对象 mace::MaceEngine engine(&net_def, device_type, input_names, output_names); -//7. 如果设备类型是OPENCL或HEXAGON,可以在此释放model_data -if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { +//7. 如果设备类型是GPU或者HEXAGON,可以在此释放model_data +if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 0b6426d13e6117f8871c79b149ef7ef72361227b..00f46ab615eb6a0efdc30a364271d27881154b63 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -108,10 +108,8 @@ inline int64_t NowMicros() { DeviceType ParseDeviceType(const std::string &device_str) { if (device_str.compare("CPU") == 0) { return DeviceType::CPU; - } else if (device_str.compare("NEON") == 0) { - return DeviceType::NEON; - } else if (device_str.compare("OPENCL") == 0) { - return DeviceType::OPENCL; + } else if (device_str.compare("GPU") == 0) { + return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; } else { @@ -198,7 +196,7 @@ bool Run(MaceEngine *engine, return true; } -DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]"); +DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]"); DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); DEFINE_string(output_node, "output_node0,output_node1", @@ -279,7 +277,7 @@ int Main(int argc, char **argv) { mace::SetOpenMPThreadPolicy( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); @@ -347,7 +345,7 @@ int Main(int argc, char **argv) { LOG(INFO) << "Run init"; std::unique_ptr engine_ptr( new mace::MaceEngine(&net_def, device_type, input_names, output_names)); - if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc index 51b294aa49ea5db9db51e08e8a2a0f376c41fed0..fd19be5ca7063d2a7a19e1d61423ef5cb517a1a3 100644 --- a/mace/benchmark/model_throughput_test.cc +++ b/mace/benchmark/model_throughput_test.cc @@ -141,10 +141,8 @@ std::string FormatName(const std::string input) { DeviceType ParseDeviceType(const std::string &device_str) { if (device_str.compare("CPU") == 0) { return DeviceType::CPU; - } else if (device_str.compare("NEON") == 0) { - return DeviceType::NEON; - } else if (device_str.compare("OPENCL") == 0) { - return DeviceType::OPENCL; + } else if (device_str.compare("GPU") == 0) { + return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; } else { @@ -277,7 +275,7 @@ int Main(int argc, char **argv) { FLAGS_gpu_model_data_file.c_str()); NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data); - mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names, + mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names, output_names); mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data); diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc index 53fd5851f22fe5d625e1121803c3713b6bee7892..07776bc12fbcf6fd9db34577d8a0ea63a766f865 100644 --- a/mace/core/allocator.cc +++ b/mace/core/allocator.cc @@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) { } MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); -MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator()); #ifdef MACE_ENABLE_OPENCL -MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator()); +MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator()); #endif MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); diff --git a/mace/core/buffer.h b/mace/core/buffer.h index 3c30e8d430bd637fd9489b7a2330a711c32c0f37..d822c90a5de359cbe7844687e30f19926956710e 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -189,7 +189,7 @@ class Image : public BufferBase { public: Image() : BufferBase(0), - allocator_(GetDeviceAllocator(OPENCL)), + allocator_(GetDeviceAllocator(GPU)), buf_(nullptr), mapped_buf_(nullptr) {} @@ -198,7 +198,7 @@ class Image : public BufferBase { std::accumulate( shape.begin(), shape.end(), 1, std::multiplies()) * GetEnumTypeSize(data_type)), - allocator_(GetDeviceAllocator(OPENCL)), + allocator_(GetDeviceAllocator(GPU)), mapped_buf_(nullptr) { shape_ = shape; data_type_ = data_type; diff --git a/mace/core/mace.cc b/mace/core/mace.cc index 04f66bac031653f2c00daece8e47320b5208eb42..ae603107e2f1d531a8cbce58460830ebc5183840 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run( input_tensors.push_back(input_tensor); } for (auto &output : *outputs) { - if (device_type_ == DeviceType::OPENCL) { + if (device_type_ == DeviceType::GPU) { MACE_CHECK(output.second.shape().size() == 4, "The outputs' shape must be 4-dimension with NHWC format," " please use 1 to fill missing dimensions"); @@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run( #endif #ifdef MACE_ENABLE_OPENCL - if (device_type_ == OPENCL) { + if (device_type_ == GPU) { OpenCLRuntime::Global()->SaveBuiltCLProgram(); } #endif diff --git a/mace/core/net.cc b/mace/core/net.cc index 72f186c53d297db51f2a246be40b646a0ba47c23..464f8a1ace32c9a7868c053c38f4990e953c275f 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) { auto &op = *iter; MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", op->debug_def().type(), ")"); - bool future_wait = (device_type_ == DeviceType::OPENCL && + bool future_wait = (device_type_ == DeviceType::GPU && (run_metadata != nullptr || std::distance(iter, operators_.end()) == 1)); diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 403639c2cfeaaf505c4ffb05ba28c49ce8100ba9..a260b2c48d9c712f36b61362a5b1d83449f3c8f5 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry); extern void Register_Eltwise(OperatorRegistry *op_registry); extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); extern void Register_FullyConnected(OperatorRegistry *op_registry); -extern void Register_FusedConv2D(OperatorRegistry *op_registry); extern void Register_LocalResponseNorm(OperatorRegistry *op_registry); extern void Register_MatMul(OperatorRegistry *op_registry); extern void Register_Pad(OperatorRegistry *op_registry); @@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry); extern void Register_Proposal(OperatorRegistry *op_registry); extern void Register_PSROIAlign(OperatorRegistry *op_registry); extern void Register_Quantize(OperatorRegistry *op_registry); -extern void Register_ReOrganize(OperatorRegistry *op_registry); extern void Register_Requantize(OperatorRegistry *op_registry); extern void Register_Reshape(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry); @@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() { ops::Register_Eltwise(this); ops::Register_FoldedBatchNorm(this); ops::Register_FullyConnected(this); - ops::Register_FusedConv2D(this); ops::Register_LocalResponseNorm(this); ops::Register_MatMul(this); ops::Register_Pad(this); @@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() { ops::Register_PSROIAlign(this); ops::Register_Quantize(this); ops::Register_Requantize(this); - ops::Register_ReOrganize(this); ops::Register_Reshape(this); ops::Register_ResizeBilinear(this); ops::Register_Slice(this); diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 14caa64931d9eee19f92c0c6f13c05dc7f77765e..ce451491c454a1213e9db7bf446ad81152877bc3 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { VLOG(3) << "Model data size: " << model_data_size; if (model_data_size > 0) { - if (type == DeviceType::CPU || type == DeviceType::NEON) { + if (type == DeviceType::CPU) { tensor_buffer_ = std::unique_ptr( new Buffer(GetDeviceAllocator(type), model_data_ptr, @@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { tensor_map_[const_tensor.name()] = std::move(tensor); } - if (type == DeviceType::CPU || type == DeviceType::OPENCL) { + if (type == DeviceType::CPU || type == DeviceType::GPU) { CreateOutputTensorBuffer(net_def, type); } } @@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, } MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); for (auto &mem_block : net_def.mem_arena().mem_block()) { - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { std::unique_ptr image_buf( new Image({mem_block.x(), mem_block.y()}, dtype)); preallocated_allocator_.SetBuffer(mem_block.mem_id(), @@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, std::unique_ptr tensor (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype)); tensor->SetSourceOpName(op.name()); - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << " Mem: " << mem_ids[i] << " Image shape: " @@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, } ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { - if (device_type == CPU || device_type == NEON) { + if (device_type == CPU) { return host_scratch_buffer_.get(); } else { return nullptr; diff --git a/mace/examples/example.cc b/mace/examples/example.cc index 424f06ec188d6570e5db084a39abb3862afe0e8a..63ebde3ad7e617df19aced827f004a63fa14e302 100644 --- a/mace/examples/example.cc +++ b/mace/examples/example.cc @@ -22,7 +22,7 @@ * --input_file=input_data \ * --output_file=mace.out \ * --model_data_file=model_data.data \ - * --device=OPENCL + * --device=GPU */ #include #include @@ -102,10 +102,8 @@ std::string FormatName(const std::string input) { DeviceType ParseDeviceType(const std::string &device_str) { if (device_str.compare("CPU") == 0) { return DeviceType::CPU; - } else if (device_str.compare("NEON") == 0) { - return DeviceType::NEON; - } else if (device_str.compare("OPENCL") == 0) { - return DeviceType::OPENCL; + } else if (device_str.compare("GPU") == 0) { + return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; } else { @@ -135,7 +133,7 @@ DEFINE_string(output_file, DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); -DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); @@ -160,7 +158,7 @@ bool RunModel(const std::vector &input_names, MaceStatus res = mace::SetOpenMPThreadPolicy( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); @@ -178,7 +176,7 @@ bool RunModel(const std::vector &input_names, // Init model mace::MaceEngine engine(&net_def, device_type, input_names, output_names); - if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 8d7b70820f730c5aea1f75a592d49fbde870086d..ca69aa3e516a9e4a2c549fbd97a593a62874aca2 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -162,7 +162,7 @@ class ActivationFunctor { #ifdef MACE_ENABLE_OPENCL template -class ActivationFunctor { +class ActivationFunctor { public: ActivationFunctor(ActivationType type, T relux_max_limit) : activation_(type), relux_max_limit_(static_cast(relux_max_limit)) {} diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 14221e901cb896efbdc4ea3fa38e2eeb5fb5b1bb..abc7efd837db5bdcfc9d5e41bd8704549635e901 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -93,7 +93,7 @@ struct AddNFunctor { #ifdef MACE_ENABLE_OPENCL template -struct AddNFunctor { +struct AddNFunctor { void operator()(const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future); diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index f65949a86ef269f196916279b47d89074733d3db..48abc908a35f2c7b883115a18bc137c4cb52211e 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -128,7 +128,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct BatchNormFunctor : BatchNormFunctorBase { +struct BatchNormFunctor : BatchNormFunctorBase { BatchNormFunctor(const bool folded_constant, const ActivationType activation, const float relux_max_limit) diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index b1aa40e9654f5126dd416983d2822e54a6a91e30..a6df23501666c1bb2eca1edfbcb2191a08217fca 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -65,7 +65,7 @@ struct BiasAddFunctor { #ifdef MACE_ENABLE_OPENCL template -struct BiasAddFunctor { +struct BiasAddFunctor { void operator()(const Tensor *input, const Tensor *bias, Tensor *output, diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index b8568f5f110d37d8c83746d6c8779ce2a23d1baa..b2d9822d99b8e22e6afd47526afab4fe9597f13a 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { }; template -struct BufferToImageFunctor : BufferToImageFunctorBase { +struct BufferToImageFunctor : BufferToImageFunctorBase { explicit BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {} void operator()(Tensor *input, diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 06e50708c294394d74dab1180c17f80fdb849744..2aa3d600137c46d28b1c46d15c4917efd17376b0 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -67,7 +67,7 @@ struct ChannelShuffleFunctor { #ifdef MACE_ENABLE_OPENCL template -struct ChannelShuffleFunctor { +struct ChannelShuffleFunctor { explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 6ae7a769e55488b44f711f66e8c34ab3f3736c7e..69ee79729f845e00870a705875247d1b390a4925 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct ConcatFunctor : ConcatFunctorBase { +struct ConcatFunctor : ConcatFunctorBase { explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} void operator()(const std::vector &input_list, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index b52d8e6359ffffc629e7fba156d7920800c3dd36..be134f7b24d63812de449eff5f046f87cdcb02cb 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -615,7 +615,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct Conv2dFunctor : Conv2dFunctorBase { +struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &padding_type, const std::vector &paddings, diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 2612e073d6d183ec6b9d0085eb33f83d0d55923e..193dac61cfc26cc172b2353628c71d0167c48aa9 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor { #ifdef MACE_ENABLE_OPENCL template -struct DepthToSpaceOpFunctor { +struct DepthToSpaceOpFunctor { DepthToSpaceOpFunctor(const int block_size, bool d2s) : block_size_(block_size), d2s_(d2s) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 0198b8e14fc9dc3cfbfc395678dfef3ecd22dc5b..a0f0b1e3b4041bbe7f56c3d8b3bf6a1ccfadbd0b 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor #ifdef MACE_ENABLE_OPENCL template -struct DepthwiseConv2dFunctor +struct DepthwiseConv2dFunctor : DepthwiseConv2dFunctorBase { DepthwiseConv2dFunctor(const int *strides, const Padding padding_type, diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 703a515ef76eedb8523d6d9823ae3d9703f67318..134a06d22ebd2ec88f61abe748d4c5dcce60cc1d 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -363,7 +363,7 @@ struct EltwiseFunctor: EltwiseFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct EltwiseFunctor : EltwiseFunctorBase { +struct EltwiseFunctor : EltwiseFunctorBase { EltwiseFunctor(const EltwiseType type, const std::vector &coeff, const float value) diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 3fa4d6129a6507168e1913a4ac55dae608c1a320..0ac9ad5518f7cb34ae5c16a7ca5e9396c6aa3770 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -88,7 +88,7 @@ struct FullyConnectedFunctor: FullyConnectedBase { #ifdef MACE_ENABLE_OPENCL template -struct FullyConnectedFunctor : FullyConnectedBase { +struct FullyConnectedFunctor : FullyConnectedBase { FullyConnectedFunctor(const int /*BufferType*/ weight_type, const ActivationType activation, const float relux_max_limit) diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index f572f63d62ec456a828a9d8ab2cf00cd9a48d15b..0d94d2c571afcf11e2206654d069b4656efc727d 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -75,7 +75,7 @@ struct MatMulFunctor { #ifdef MACE_ENABLE_OPENCL template -struct MatMulFunctor { +struct MatMulFunctor { void operator()(const Tensor *A, const Tensor *B, Tensor *C, diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 04b844e8e4e64f9098c8b3d4d5000599f75ed713..0e29e00f34f93935292304bddc0b91be8297bca8 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void ActivationFunctor::operator()(const Tensor *input, +void ActivationFunctor::operator()(const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future) { @@ -46,7 +46,7 @@ void ActivationFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -124,7 +124,7 @@ void ActivationFunctor::operator()(const Tensor *input, } } -template struct ActivationFunctor; -template struct ActivationFunctor; +template struct ActivationFunctor; +template struct ActivationFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index a67c6aaf120f8a04acf6ba065261fa9aa6b8f377..b4e2493f7876cc8b2d12dae66a5c70be3606ebb4 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -22,7 +22,7 @@ namespace mace { namespace kernels { template -void AddNFunctor::operator()( +void AddNFunctor::operator()( const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future) { @@ -58,7 +58,7 @@ void AddNFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -119,9 +119,9 @@ void AddNFunctor::operator()( } } -template struct AddNFunctor; +template struct AddNFunctor; -template struct AddNFunctor; +template struct AddNFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 84f926334c71494436744a7df5ff9151a67e4ed6..21adfd9626f5e60f9962fc6576299a007ffc2bad 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void BatchNormFunctor::operator()(const Tensor *input, +void BatchNormFunctor::operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, @@ -56,7 +56,7 @@ void BatchNormFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -130,7 +130,7 @@ void BatchNormFunctor::operator()(const Tensor *input, } } -template struct BatchNormFunctor; -template struct BatchNormFunctor; +template struct BatchNormFunctor; +template struct BatchNormFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index c9b1d6e8b3a8c93762e5ab43f6aa811f64942da8..5cffe75caf94f61af804ab71f6381233df6012e1 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -22,7 +22,7 @@ namespace mace { namespace kernels { template -void BiasAddFunctor::operator()(const Tensor *input, +void BiasAddFunctor::operator()(const Tensor *input, const Tensor *bias, Tensor *output, StatsFuture *future) { @@ -49,7 +49,7 @@ void BiasAddFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -115,7 +115,7 @@ void BiasAddFunctor::operator()(const Tensor *input, } } -template struct BiasAddFunctor; -template struct BiasAddFunctor; +template struct BiasAddFunctor; +template struct BiasAddFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index dcaa7b4c9ae1705b73526dcb369e97ec135fcd86..c2593d76f39c226315b27a2887446eeb75730513 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -20,7 +20,7 @@ namespace mace { namespace kernels { template -void BufferToImageFunctor::operator()( +void BufferToImageFunctor::operator()( Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { std::vector image_shape; @@ -95,7 +95,7 @@ void BufferToImageFunctor::operator()( built_options.emplace("-DOUT_OF_RANGE_CHECK"); if (!kernel_error_) { kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -177,8 +177,8 @@ void BufferToImageFunctor::operator()( } } -template struct BufferToImageFunctor; -template struct BufferToImageFunctor; +template struct BufferToImageFunctor; +template struct BufferToImageFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 0a6a460798e47da02bf91a37e4ba8282d1ef0535..b30ecb69b60cb0a12dadf63c245e606227d63ca5 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void ChannelShuffleFunctor::operator()( +void ChannelShuffleFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { @@ -56,7 +56,7 @@ void ChannelShuffleFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -108,8 +108,8 @@ void ChannelShuffleFunctor::operator()( } template -struct ChannelShuffleFunctor; +struct ChannelShuffleFunctor; template -struct ChannelShuffleFunctor; +struct ChannelShuffleFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index b22896ea84a71253409fdd54cc7c6a1d0aec919d..4dacf8cfa8bf8fba02c669e889f990831c1fc9ef 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel, } template -void ConcatFunctor::operator()( +void ConcatFunctor::operator()( const std::vector &input_list, Tensor *output, StatsFuture *future) { @@ -264,8 +264,8 @@ void ConcatFunctor::operator()( } } -template struct ConcatFunctor; -template struct ConcatFunctor; +template struct ConcatFunctor; +template struct ConcatFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index f42f1428cc91015d49702bec9e69b438550834c6..696b1124e9c85f6638785ea10495f50eb266e4de 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::unique_ptr *kernel_error); template -void Conv2dFunctor::operator()(const Tensor *input, +void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, @@ -128,8 +128,8 @@ void Conv2dFunctor::operator()(const Tensor *input, } } -template struct Conv2dFunctor; -template struct Conv2dFunctor; +template struct Conv2dFunctor; +template struct Conv2dFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 9f7694bc6517b66cc4dc146babfc7354c06081c5..d148edb2bbdefa587f10ac28e49ba6c8c95525b2 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index fb86602c99fc5efaefa2b7f2cb9d5c272798a50c..a51ff2527221509ce197209e2a8b5d2898f39077 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 9db8df0f64bf676a2d090532299df6a138541162..b8431193358909deb7fa435b1e72e3a3f843c30c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 1ecbc891bfe9682b3c0f64a23f393e5ada3c5be4..f5f45ca94a74f91471060947e9717182e188b44a 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void DepthToSpaceOpFunctor::operator()( +void DepthToSpaceOpFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { const index_t batch = input->dim(0); const index_t input_height = input->dim(1); @@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor::operator()( } } -template struct DepthToSpaceOpFunctor; -template struct DepthToSpaceOpFunctor; +template struct DepthToSpaceOpFunctor; +template struct DepthToSpaceOpFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 534aa64e6152e2f6eaa930f8e98c8960711d5479..67bfbf7a6c051c156b1db4b0e9114fcc09cdb1ce 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, } template -void DepthwiseConv2dFunctor::operator()( +void DepthwiseConv2dFunctor::operator()( const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor::operator()( &kwg_size_, &kernel_error_); } -template struct DepthwiseConv2dFunctor; -template struct DepthwiseConv2dFunctor; +template struct DepthwiseConv2dFunctor; +template struct DepthwiseConv2dFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index 3e3d2e471580d59f1809064b119af73e1fd5924e..d834c292c51697adacb36cb849cbb2b50b6085fc 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { template -void EltwiseFunctor::operator()(const Tensor *input0, +void EltwiseFunctor::operator()(const Tensor *input0, const Tensor *input1, Tensor *output, StatsFuture *future) { @@ -74,7 +74,7 @@ void EltwiseFunctor::operator()(const Tensor *input0, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -129,7 +129,7 @@ void EltwiseFunctor::operator()(const Tensor *input0, } } -template struct EltwiseFunctor; -template struct EltwiseFunctor; +template struct EltwiseFunctor; +template struct EltwiseFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index 50f8ed5cd5e9c3ad40b8f9cc0d3a5928fc1fb93c..378a9d835436f7f6cd8932935dec8f58d3d4abdc 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel, } // namespace template -void FullyConnectedFunctor::operator()( +void FullyConnectedFunctor::operator()( const Tensor *input, const Tensor *weight, const Tensor *bias, @@ -305,9 +305,9 @@ void FullyConnectedFunctor::operator()( } } -template struct FullyConnectedFunctor; +template struct FullyConnectedFunctor; -template struct FullyConnectedFunctor; +template struct FullyConnectedFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index d941040ed83ed1151569c5139eb30f73436358b5..b307c44572932bbcdbb5abee14bca75714abc36b 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { template -void MatMulFunctor::operator()(const Tensor *A, +void MatMulFunctor::operator()(const Tensor *A, const Tensor *B, Tensor *C, StatsFuture *future) { @@ -53,7 +53,7 @@ void MatMulFunctor::operator()(const Tensor *A, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -98,9 +98,9 @@ void MatMulFunctor::operator()(const Tensor *A, } } -template struct MatMulFunctor; +template struct MatMulFunctor; -template struct MatMulFunctor; +template struct MatMulFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index b310a93afea4bff99053347864ee7695329d3ded..a67cae0d5966a8a3becf30631d657d9ca016e9b6 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error->Map(nullptr); *(kernel_error->mutable_data()) = 0; kernel_error->UnMap(); @@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::vector buffer_shape = {batch, height, width, channels}; Workspace ws; Tensor *buffer = ws.CreateTensor("Buffer", - GetDeviceAllocator(DeviceType::OPENCL), + GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v()); buffer->Resize(buffer_shape); std::vector image_shape; Tensor *image = ws.CreateTensor("Image", - GetDeviceAllocator(DeviceType::OPENCL), + GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v()); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); image->ResizeImage(buffer->shape(), image_shape); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 45f27d6dae21ec2f8b2c3d5b9d4913c913f753b0..46eb496832c1536c2c0d8ee3ef645062ad3a405e 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { template -void PadFunctor::operator()( +void PadFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { @@ -59,7 +59,7 @@ void PadFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -115,9 +115,9 @@ void PadFunctor::operator()( } template -struct PadFunctor; +struct PadFunctor; template -struct PadFunctor; +struct PadFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index e3d9081e007446d54e5ccdb68014d48caa1855f2..5d31b76f325111f289f0aa88fe286ccc93357a36 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -22,7 +22,7 @@ namespace mace { namespace kernels { template -void PoolingFunctor::operator()(const Tensor *input, +void PoolingFunctor::operator()(const Tensor *input, Tensor *output, StatsFuture *future) { MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) @@ -50,7 +50,7 @@ void PoolingFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -148,7 +148,7 @@ void PoolingFunctor::operator()(const Tensor *input, } } -template struct PoolingFunctor; -template struct PoolingFunctor; +template struct PoolingFunctor; +template struct PoolingFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index be4fe3cd103d7f5ce25fe41c0a26d06d7b69d8fd..1b154bb1adb97657f8e625a5fe839fbc17347550 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void ResizeBilinearFunctor::operator()( +void ResizeBilinearFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { const index_t batch = input->dim(0); const index_t in_height = input->dim(1); @@ -50,7 +50,7 @@ void ResizeBilinearFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -113,8 +113,8 @@ void ResizeBilinearFunctor::operator()( } } -template struct ResizeBilinearFunctor; -template struct ResizeBilinearFunctor; +template struct ResizeBilinearFunctor; +template struct ResizeBilinearFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index ba93231ddd965bb90beb5ded97d675987e7b4630..29b5f909b28e1504f6a9c825c3e50e1a3b44e676 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { template -void SliceFunctor::operator()( +void SliceFunctor::operator()( const Tensor *input, const std::vector &output_list, StatsFuture *future) { @@ -51,7 +51,7 @@ void SliceFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -133,9 +133,9 @@ void SliceFunctor::operator()( } template -struct SliceFunctor; +struct SliceFunctor; template -struct SliceFunctor; +struct SliceFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 7e463997b1f70c1c7593ead2fb5e48dd70a728dd..47c10dca9fa70e3620dd6ae52e24aff6204c806c 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -23,7 +23,7 @@ namespace mace { namespace kernels { template -void SoftmaxFunctor::operator()(const Tensor *logits, +void SoftmaxFunctor::operator()(const Tensor *logits, Tensor *output, StatsFuture *future) { const index_t batch = logits->dim(0); @@ -49,7 +49,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -95,7 +95,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, } } -template struct SoftmaxFunctor; -template struct SoftmaxFunctor; +template struct SoftmaxFunctor; +template struct SoftmaxFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index c5b9df5a0b72bcf758c0728c63bb7e4a883afde9..454d2d0a0d0e148618262ebb710d3a9712ec2ca2 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -25,7 +25,7 @@ namespace mace { namespace kernels { template -void SpaceToBatchFunctor::operator()( +void SpaceToBatchFunctor::operator()( Tensor *space_tensor, const std::vector &output_shape, Tensor *batch_tensor, @@ -60,7 +60,7 @@ void SpaceToBatchFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -120,8 +120,8 @@ void SpaceToBatchFunctor::operator()( } } -template struct SpaceToBatchFunctor; -template struct SpaceToBatchFunctor; +template struct SpaceToBatchFunctor; +template struct SpaceToBatchFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 591a6208e654ac5edbf12f7687f844e5e52f6acc..62a7ca601f5a29b5387f90938281c3cb73128235 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -22,7 +22,7 @@ namespace mace { namespace kernels { template -void WinogradTransformFunctor::operator()( +void WinogradTransformFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); @@ -39,7 +39,7 @@ void WinogradTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -117,7 +117,7 @@ void WinogradTransformFunctor::operator()( } template -void WinogradInverseTransformFunctor::operator()( +void WinogradInverseTransformFunctor::operator()( const Tensor *input_tensor, const Tensor *bias, Tensor *output_tensor, @@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor::operator()( } } -template struct WinogradTransformFunctor; -template struct WinogradTransformFunctor; +template struct WinogradTransformFunctor; +template struct WinogradTransformFunctor; -template struct WinogradInverseTransformFunctor; -template struct WinogradInverseTransformFunctor; +template struct WinogradInverseTransformFunctor; +template struct WinogradInverseTransformFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h index 62728e5b1940bdc0c65ac707914eb914aeae5612..bd61003205bc8d75dae104e8142d2ec4fff4e767 100644 --- a/mace/kernels/pad.h +++ b/mace/kernels/pad.h @@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase { std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channel = input->dim(3); + const index_t channel = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); #pragma omp parallel for collapse(3) for (index_t b = 0; b < batch; ++b) { - for (index_t h = 0; h < height; ++h) { - for (index_t w = 0; w < width; ++w) { - const index_t in_offset = (((b * height + h) * width) + w) * channel; + for (index_t c = 0; c < channel; ++c) { + for (index_t h = 0; h < height; ++h) { + const index_t in_offset = (((b * channel + c) * height) + h) * width; const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) - + (h + this->paddings_[2])) * output->dim(2) - + (w + this->paddings_[4])) * output->dim(3) + + (c + this->paddings_[2])) * output->dim(2) + + (h + this->paddings_[4])) * output->dim(3) + this->paddings_[6]; memcpy(output_ptr + out_offset, input_ptr + in_offset, - channel * sizeof(T)); + width * sizeof(T)); } } } @@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct PadFunctor : PadFunctorBase { +struct PadFunctor : PadFunctorBase { PadFunctor(const std::vector &paddings, const float constant_value) : PadFunctorBase(paddings, constant_value) {} diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 95d9dff163f5b7de8b06319953e68fbb00f11029..384ce834e752aa2233c247638b5c24eb36b08f78 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -261,7 +261,7 @@ struct PoolingFunctor: PoolingFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct PoolingFunctor : PoolingFunctorBase { +struct PoolingFunctor : PoolingFunctorBase { PoolingFunctor(const PoolingType pooling_type, const int *kernels, const int *strides, diff --git a/mace/kernels/reorganize.h b/mace/kernels/reorganize.h deleted file mode 100644 index e987d3e15f494cee561e22a37e24a62c0bb88092..0000000000000000000000000000000000000000 --- a/mace/kernels/reorganize.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_REORGANIZE_H_ -#define MACE_KERNELS_REORGANIZE_H_ - -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { -namespace kernels { - -template -struct ReOrganizeFunctor { - void operator()(const Tensor *input, - const std::vector &out_shape, - Tensor *output, - StatsFuture *future) { - const bool w2c = out_shape[3] > input->dim(3); - - const index_t height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_chan = input->dim(3); - const index_t output_width = output->dim(2); - const index_t output_chan = output->dim(3); - - const T *input_ptr = input->data(); - T *output_ptr = output->mutable_data(); - - if (w2c) { - MACE_CHECK((out_shape[3] % input->dim(3)) == 0); - const index_t multiplier = out_shape[3] / input->dim(3); -#pragma omp parallel for collapse(4) - for (index_t n = 0; n < out_shape[0]; ++n) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t c = 0; c < out_shape[3]; ++c) { - const index_t out_offset = - ((n * height + h) * output_width + w) - * output_chan + c; - const index_t in_w_idx = w + (c % multiplier) * output_width; - const index_t in_chan_idx = c / multiplier; - const index_t in_offset = - ((n * height + h) * input_width + in_w_idx) - * input_chan + in_chan_idx; - output_ptr[out_offset] = input_ptr[in_offset]; - } - } - } - } - } else { - MACE_CHECK((input->dim(3) % out_shape[3]) == 0); - const index_t multiplier = input->dim(3) / out_shape[3]; - -#pragma omp parallel for collapse(4) - for (index_t n = 0; n < out_shape[0]; ++n) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t c = 0; c < out_shape[3]; ++c) { - const index_t out_offset = - ((n * height + h) * output_width + w) - * output_chan + c; - const index_t in_w_idx = w % input_width; - const index_t in_chan_idx = w / input_width + c * multiplier; - const index_t in_offset = - ((n * height + h) * input_width + in_w_idx) - * input_chan + in_chan_idx; - output_ptr[out_offset] = input_ptr[in_offset]; - } - } - } - } - } - } -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_REORGANIZE_H_ diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 59f049cb81aef22e5751a50a453b3707d90dcaad..6054f8988c5e80443c91280707c5d33149d5a57b 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -179,7 +179,7 @@ struct ResizeBilinearFunctor #ifdef MACE_ENABLE_OPENCL template -struct ResizeBilinearFunctor +struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { ResizeBilinearFunctor(const std::vector &size, bool align_corners) : ResizeBilinearFunctorBase(size, align_corners) {} diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 32eddad47af21ab06f2f26eb61583ace6844b914..6d45d8c3cedb8b0ec99d9329049de74f7443ac2d 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct SliceFunctor : SliceFunctorBase { +struct SliceFunctor : SliceFunctorBase { explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {} void operator()(const Tensor *input, diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index f06ca4a384e3c2836509be0aef55931c17dbe4b9..fae1d0651dd81ea9a67cd8c2df375a879bd5bc18 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -94,7 +94,7 @@ struct SoftmaxFunctor { #ifdef MACE_ENABLE_OPENCL template -struct SoftmaxFunctor { +struct SoftmaxFunctor { void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); cl::Kernel kernel_; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 52119a45ca8ffc00831deffaacb7ea47635379f8..77b54990cf31239ea7e021b23c6a116080eaab7f 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { +struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { SpaceToBatchFunctor(const std::vector &paddings, const std::vector &block_shape, bool b2s) diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 932604bc63cc786aa5e44096c74fff1b22d61c4c..6f1662a2a29cc05090414f9ab30927b49ff69bab 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct WinogradTransformFunctor +struct WinogradTransformFunctor : WinogradTransformFunctorBase { WinogradTransformFunctor(const Padding &padding_type, const std::vector &paddings) @@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { #ifdef MACE_ENABLE_OPENCL template -struct WinogradInverseTransformFunctor +struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { WinogradInverseTransformFunctor(const int batch, const int height, diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 98fb9948414b0e87132ef74522dfa632d1e8996b..0c90bc9e561bf13da75fad0c0b10bf85eedf30bf 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ActivationOp); + ActivationOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ActivationOp); + ActivationOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 3bc95e5245ac1ab69c7f211b2c33e69a4277faab..9c95b9ecf8f875e96fd5e6d3ca3b54284b9f78bf 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -33,7 +33,7 @@ void ReluBenchmark( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -45,7 +45,7 @@ void ReluBenchmark( .Output("Output") .AddStringArg("activation", "RELU") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -83,8 +83,8 @@ void ReluBenchmark( #define BM_RELU(N, C, H, W) \ BM_RELU_MACRO(N, C, H, W, float, CPU); \ - BM_RELU_MACRO(N, C, H, W, float, OPENCL); \ - BM_RELU_MACRO(N, C, H, W, half, OPENCL); + BM_RELU_MACRO(N, C, H, W, float, GPU); \ + BM_RELU_MACRO(N, C, H, W, half, GPU); BM_RELU(1, 1, 512, 512); BM_RELU(1, 3, 128, 128); @@ -107,7 +107,7 @@ void ReluxBenchmark( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -151,8 +151,8 @@ void ReluxBenchmark( #define BM_RELUX(N, C, H, W) \ BM_RELUX_MACRO(N, C, H, W, float, CPU); \ - BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \ - BM_RELUX_MACRO(N, C, H, W, half, OPENCL); + BM_RELUX_MACRO(N, C, H, W, float, GPU); \ + BM_RELUX_MACRO(N, C, H, W, half, GPU); BM_RELUX(1, 1, 512, 512); BM_RELUX(1, 3, 128, 128); @@ -171,7 +171,7 @@ void PreluBenchmark( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -185,7 +185,7 @@ void PreluBenchmark( .Output("Output") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Alpha", "AlphaImage", @@ -226,8 +226,8 @@ void PreluBenchmark( #define BM_PRELU(N, C, H, W) \ BM_PRELU_MACRO(N, C, H, W, float, CPU); \ - BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \ - BM_PRELU_MACRO(N, C, H, W, half, OPENCL); + BM_PRELU_MACRO(N, C, H, W, float, GPU); \ + BM_PRELU_MACRO(N, C, H, W, half, GPU); BM_PRELU(1, 1, 512, 512); BM_PRELU(1, 3, 128, 128); @@ -250,7 +250,7 @@ void TanhBenchmark( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -292,8 +292,8 @@ void TanhBenchmark( #define BM_TANH(N, C, H, W) \ BM_TANH_MACRO(N, C, H, W, float, CPU); \ - BM_TANH_MACRO(N, C, H, W, float, OPENCL); \ - BM_TANH_MACRO(N, C, H, W, half, OPENCL); + BM_TANH_MACRO(N, C, H, W, float, GPU); \ + BM_TANH_MACRO(N, C, H, W, half, GPU); BM_TANH(1, 1, 512, 512); BM_TANH(1, 3, 128, 128); @@ -316,7 +316,7 @@ void SigmoidBenchmark( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -359,8 +359,8 @@ void SigmoidBenchmark( #define BM_SIGMOID(N, C, H, W) \ BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ - BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \ - BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL); + BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \ + BM_SIGMOID_MACRO(N, C, H, W, half, GPU); BM_SIGMOID(1, 1, 512, 512); BM_SIGMOID(1, 3, 128, 128); diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index d245d5c2e52f85238597bded4b247c246c512d22..a3b2ab1929838e392d40fe9df7269108f0355e72 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -31,7 +31,7 @@ void TestSimpleRelu() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -68,7 +68,7 @@ void TestSimpleRelu() { TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu(); } TEST_F(ActivationOpTest, OPENCLSimpleRelu) { - TestSimpleRelu(); + TestSimpleRelu(); } namespace { @@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() { // Add input data net.AddInputFromArray("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) { } TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { - TestUnalignedSimpleRelu(); + TestUnalignedSimpleRelu(); } @@ -131,7 +131,7 @@ void TestSimpleRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -170,7 +170,7 @@ void TestSimpleRelux() { TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux(); } TEST_F(ActivationOpTest, OPENCLSimple) { - TestSimpleRelux(); + TestSimpleRelux(); } namespace { @@ -183,7 +183,7 @@ void TestSimpleReluRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) { } TEST_F(ActivationOpTest, OPENCLSimpleRelux) { - TestSimpleReluRelux(); + TestSimpleReluRelux(); } namespace { @@ -238,7 +238,7 @@ void TestSimplePrelu() { {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); net.AddInputFromArray("Alpha", {2}, {2.0, 3.0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Alpha", "AlphaImage", @@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) { } TEST_F(ActivationOpTest, OPENCLSimplePrelu) { - TestSimplePrelu(); + TestSimplePrelu(); } namespace { @@ -296,7 +296,7 @@ void TestSimpleTanh() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -336,7 +336,7 @@ void TestSimpleTanh() { TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh(); } TEST_F(ActivationOpTest, OPENCLSimpleTanh) { - TestSimpleTanh(); + TestSimpleTanh(); } namespace { @@ -349,7 +349,7 @@ void TestSimpleSigmoid() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) { } TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) { - TestSimpleSigmoid(); + TestSimpleSigmoid(); } } // namespace test diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 41ad93d52d13a1fee20c28fe0db77076fc5e92f8..1ad27c2f53e3a9493c4cd6cc3c05500f400ec35f 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - AddNOp); + AddNOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - AddNOp); + AddNOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index bc18acf4264b12b0f79fe5f14bfd121f97f16ab9..36c9948e9e3e80bf91c31a7ad27a41b88a1bf4dc 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { net.AddRandomInput(MakeString("Input", i).c_str(), {n, h, w, c}); } - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { for (int i = 0; i < inputs; ++i) { BufferToImage(&net, MakeString("Input", i).c_str(), MakeString("InputImage", i).c_str(), @@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { #define BM_ADDN(INPUTS, N, H, W, C) \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL); + BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \ + BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU); BM_ADDN(2, 1, 256, 256, 32); BM_ADDN(2, 1, 128, 128, 32); diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 4331a567e9fbaf56605163a7e394d04714dff3f9..5b7c9d984b6b0be77d40eb70ec2bc6c76392b6d7 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -64,7 +64,7 @@ void SimpleAdd3() { {-0.1582, 2, 3, 4, 5, 6}); const int input_num = 4; - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { // run on gpu for (int i = 0; i < input_num; ++i) { BufferToImage(&net, MakeString("Input", i), @@ -105,7 +105,7 @@ void SimpleAdd3() { } // namespace TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3(); } -TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3(); } +TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3(); } namespace { template @@ -166,7 +166,7 @@ void RandomTest() { } } // namespace -TEST_F(AddnOpTest, OPENCLRandom) { RandomTest(); } +TEST_F(AddnOpTest, OPENCLRandom) { RandomTest(); } } // namespace test } // namespace ops diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index de1ce48fd415ee094429c210097b3af1b2e8c23f..c9e6db59887fdbadfa8b9abdcd9a57df07abfb9b 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BatchNormOp); + BatchNormOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BatchNormOp); + BatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 8817a0715f1874cdefe251559c86841d94686a7a..f07966061f1bd0fd58806d439679f43ff68c3d99 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -32,7 +32,7 @@ void BatchNorm( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -52,7 +52,7 @@ void BatchNorm( .AddFloatArg("epsilon", 1e-3) .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Scale", "ScaleImage", @@ -107,8 +107,8 @@ void BatchNorm( #define BM_BATCH_NORM(N, C, H, W) \ BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \ - BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL); + BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \ + BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU); BM_BATCH_NORM(1, 1, 512, 512); BM_BATCH_NORM(1, 3, 128, 128); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 4b4e6e8d76a7e15fa4eea5f8ea04f3850e49fc75..05cc2ab86958bae9c1d8cbba02ad017c343480d0 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -49,7 +49,7 @@ void Simple() { net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Scale", "ScaleImage", @@ -90,7 +90,7 @@ void Simple() { TEST_F(BatchNormOpTest, SimpleCPU) { Simple(); } -TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple(); } +TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple(); } TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // generate random input @@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}); net.TransformDataFormat("Input", NHWC, @@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", + BufferToImage(&net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", + BufferToImage(&net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Tuning setenv("MACE_TUNING", "1", 1); - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); unsetenv("MACE_TUNING"); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); } @@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}); net.TransformDataFormat("Input", NHWC, @@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", + BufferToImage(&net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", + BufferToImage(&net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // Tuning setenv("MACE_TUNING", "1", 1); - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); unsetenv("MACE_TUNING"); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); } @@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}); net.TransformDataFormat("Input", NHWC, @@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", + BufferToImage(&net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", + BufferToImage(&net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // tuning setenv("MACE_TUNING", "1", 1); - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); unsetenv("MACE_TUNING"); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); } @@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}); net.TransformDataFormat("Input", NHWC, @@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", + BufferToImage(&net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", + BufferToImage(&net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // tuning setenv("MACE_TUNING", "1", 1); - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); unsetenv("MACE_TUNING"); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); } diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 7966657731af26002aee48daa1b0788cbeba82b5..fc10cc5845efcc220d7a08d9084b8fafd0d3a130 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -20,15 +20,15 @@ namespace ops { void Register_BatchToSpaceND(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BatchToSpaceNDOp); + BatchToSpaceNDOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BatchToSpaceNDOp); + BatchToSpaceNDOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 447b84c405c468ab5f23309f8e989d6a67383277..5cfe7015cf531d73c22af4f0ac6ccc3b3292d9bc 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -63,7 +63,7 @@ void BMBatchToSpace( BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ - BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL); + BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index b2d217dceba554056a69afb83774f4a1d3ec88e6..dd4e20f4cfd8bb7fed973ca60fd2f5a334876afd 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BiasAddOp); + BiasAddOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BiasAddOp); + BiasAddOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 8dbf709d100cf925fd0f60e1fb8ff8266f979519..851c8a17fc28fb50dc8f33b6472b00f618e9adfe 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { .Input("Bias") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Bias", "BiasImage", @@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { #define BM_BIAS_ADD(N, C, H, W) \ BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ - BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \ - BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL); + BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \ + BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU); BM_BIAS_ADD(1, 1, 512, 512); BM_BIAS_ADD(1, 3, 128, 128); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 5f3aa0c1cda1878664e89077ba6af289ce245797..2c4a57739d6b083073e5f7555ffd7b5a99e88c4f 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -47,7 +47,7 @@ void BiasAddSimple() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Bias", "BiasImage", @@ -80,7 +80,7 @@ void BiasAddSimple() { TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple(); } TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { - BiasAddSimple(); + BiasAddSimple(); } TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { @@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true); net.TransformDataFormat("Input", NHWC, @@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BiasAdd", "BiasAddTest") @@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); } @@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true); net.TransformDataFormat("Input", NHWC, @@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BiasAdd", "BiasAddTest") @@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); } diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index 9270b2bf2a7065469861ac5841df1616380df66e..5a567d7495f4a1abe5cad4eb7389bf19a36e21fb 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -19,16 +19,16 @@ namespace ops { void Register_BufferToImage(OperatorRegistry *op_registry) { REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferToImageOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferToImageOp); } } // namespace ops diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index e75a6ae1b6e12d9af807fead81aa0134cc076992..de2e76f8272da1c078a679e0217c06fb57f3ae30 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type, } // namespace TEST(BufferToImageTest, ArgSmall) { - TestBidirectionTransform(kernels::ARGUMENT, {1}); + TestBidirectionTransform(kernels::ARGUMENT, {1}); } TEST(BufferToImageTest, ArgHalfSmall) { - TestBidirectionTransform(kernels::ARGUMENT, {11}); + TestBidirectionTransform(kernels::ARGUMENT, {11}); } TEST(BufferToImageTest, ArgMedia) { - TestBidirectionTransform(kernels::ARGUMENT, {11}); + TestBidirectionTransform(kernels::ARGUMENT, {11}); } TEST(BufferToImageTest, ArgLarge) { - TestBidirectionTransform(kernels::ARGUMENT, {256}); + TestBidirectionTransform(kernels::ARGUMENT, {256}); } TEST(BufferToImageTest, InputSmallSingleChannel) { - TestBidirectionTransform(kernels::IN_OUT_CHANNEL, + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1}); } TEST(BufferToImageTest, InputSmallMultipleChannel) { - TestBidirectionTransform(kernels::IN_OUT_CHANNEL, + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3}); } TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { - TestBidirectionTransform(kernels::IN_OUT_CHANNEL, + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3}); } TEST(BufferToImageTest, InputMedia) { - TestBidirectionTransform(kernels::IN_OUT_CHANNEL, + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128}); } TEST(BufferToImageTest, InputLarge) { - TestBidirectionTransform(kernels::IN_OUT_CHANNEL, + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256}); } TEST(BufferToImageTest, Filter1x1Small) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 3, 5}); } TEST(BufferToImageTest, Filter1x1Media) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 13, 17}); } TEST(BufferToImageTest, Filter1x1Large) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 128, 512}); } TEST(BufferToImageTest, Filter3x3Small) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 3, 5}); } TEST(BufferToImageTest, Filter3x3Meida) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 13, 17}); } TEST(BufferToImageTest, Filter3x3Large) { - TestBidirectionTransform(kernels::CONV2D_FILTER, + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 128, 256}); } @@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type, } // namespace TEST(BufferToImageTest, ArgFloatToHalfSmall) { - TestDiffTypeBidirectionTransform(kernels::ARGUMENT, + TestDiffTypeBidirectionTransform(kernels::ARGUMENT, {11}); } @@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { const unsigned char input_data[] = { 0xCD, 0x3C, 0x33, 0x40, }; - TestStringHalfBidirectionTransform( + TestStringHalfBidirectionTransform( kernels::ARGUMENT, {2}, input_data); } diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 980a81d5ee0ae79157b8b5dffd548f008acbaeba..f7d23e31ae0788dd29d0b60661894667b76aacce 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ChannelShuffleOp); + ChannelShuffleOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ChannelShuffleOp); + ChannelShuffleOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index aa2fbe02b48abf8eb30cb47a79f0602931f05f83..e8e3139e38612cc56794e0212cccd89c116e856c 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); int channels; - if (D == OPENCL) { + if (D == GPU) { channels = input->dim(3); } else if (D == CPU) { channels = input->dim(1); diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index a179414f65f11b77aa9a2c1b657c9d3cde5c750b..205d74e54145a801962c577e39f11085748af883 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -31,7 +31,7 @@ void ChannelShuffle( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, height, channels, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -42,7 +42,7 @@ void ChannelShuffle( .Input("Input") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -82,8 +82,8 @@ void ChannelShuffle( #define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL); + BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \ + BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU); BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 8c5df87750cff0bbc879263ad9e0b24b09a585c1..7c17c5a0c4a2f70248d938df7c9fbb69293d532f 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") @@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", + ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); // Check diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 55e2e9d09fc15f4b019366f55fc825b604d332a8..7f6f6fd893527e41b79e61997dd60460ed2ab81d 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ConcatOp); + ConcatOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ConcatOp); + ConcatOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 803b61c50211c3de08b431dd8b1918fa3e84cfcc..af6f2943e2eb123c25beed279c9a8cad6f8a7e13 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters, OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape0); - net.AddRandomInput("Input1", shape1); + net.AddRandomInput("Input0", shape0); + net.AddRandomInput("Input1", shape1); - BufferToImage(&net, "Input0", "InputImage0", + BufferToImage(&net, "Input0", "InputImage0", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", + BufferToImage(&net, "Input1", "InputImage1", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Concat", "ConcatBM") .Input("InputImage0") @@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters, // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); } const int64_t tot = @@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters, testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); } } } // namespace diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index a7243a3a667efc0d768dfbec0acfe8d9187ee865..78d14394f072be0ffc0fa1be03be07b45832e68c 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector> &shapes, concat_axis_size += shapes[i][axis]; GenerateRandomRealTypeData(shapes[i], &inputs[i]); input_ptrs[i] = inputs[i].data(); - net.AddInputFromArray(input_name, + net.AddInputFromArray(input_name, shapes[i], inputs[i]); - BufferToImage(&net, input_name, image_name, + BufferToImage(&net, input_name, image_name, kernels::BufferType::IN_OUT_CHANNEL); } @@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector> &shapes, .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "Output", + ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); // Check diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index aaff91f454aa817c340e42b45da3b557b29e3ed0..e5f01d26cd5f1a1b1e4bb428b74342e0d529caec 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - Conv2dOp); + Conv2dOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - Conv2dOp); + Conv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 690ef002c7c4781ad6f19de4554ec7a060fbf872..f7fc157df3deb5daae086e5bd39eb8cb72f7db39 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -16,6 +16,7 @@ #define MACE_OPS_CONV_2D_H_ #include +#include #include "mace/core/operator.h" #include "mace/kernels/conv_2d.h" @@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase { this->padding_type_, this->paddings_, this->dilations_.data(), - kernels::ActivationType::NOOP, - 0.0f, + kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), static_cast(OperatorBase::GetSingleArgument( "is_filter_transformed", false)), ws->GetScratchBuffer(D)) {} diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index dcb981d828d41d1c3a51c0c6531a98824c64362a..30dcf736c0d74eb0d91568bbdf00a24566d7b963 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -47,7 +47,7 @@ void Conv2d(int iters, {output_channels, channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Filter", {kernel_h, kernel_w, output_channels, @@ -68,7 +68,7 @@ void Conv2d(int iters, .AddIntsArg("dilations", {dilation, dilation}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -138,8 +138,8 @@ void Conv2d(int iters, #define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL); + BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ + BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 9880ca72a53b57354bad9fdc39602599ef49f914..2f51df772ce579cc4c95619a5bca9501837d1505 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) { } TEST_F(Conv2dOpTest, OPENCLSimple) { - TestNHWCSimple3x3VALID(); - TestNHWCSimple3x3SAME(); + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); } namespace { @@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) { } TEST_F(Conv2dOpTest, OPENCLWithoutBias) { - TestNHWCSimple3x3WithoutBias(); + TestNHWCSimple3x3WithoutBias(); } namespace { @@ -301,7 +301,7 @@ void TestNHWCCombined3x3() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) { } TEST_F(Conv2dOpTest, OPENCLStride2) { - TestNHWCCombined3x3(); + TestNHWCCombined3x3(); } +namespace { +template +void TestFusedNHWCSimple3x3VALID() { + OpsTestNet net; + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 1, 2}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", + NHWC, + "InputNCHW", + NCHW); + net.TransformDataFormat("Filter", + HWOI, + "FilterOIHW", + OIHW); + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputNCHW") + .Input("FilterOIHW") + .Input("Bias") + .Output("OutputNCHW") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + net.TransformDataFormat("OutputNCHW", + NCHW, + "Output", + NHWC); + } else if (D == DeviceType::GPU) { + BufferToImage(&net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + kernels::BufferType::CONV2D_FILTER); + BufferToImage(&net, "Bias", "BiasImage", + kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(&net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_CHANNEL); + + } else { + MACE_NOT_IMPLEMENTED; + } + + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + ExpectTensorNear(*expected, *net.GetOutput("Output")); +} +template +void TestFusedNHWCSimple3x3WithoutBias() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 1, 2}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + + if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", + NHWC, + "InputNCHW", + NCHW); + net.TransformDataFormat("Filter", + HWOI, + "FilterOIHW", + OIHW); + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("InputNCHW") + .Input("FilterOIHW") + .Output("OutputNCHW") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + net.TransformDataFormat("OutputNCHW", + NCHW, + "Output", + NHWC); + } else if (D == DeviceType::GPU) { + BufferToImage(&net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + kernels::BufferType::CONV2D_FILTER); + + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + // Transfer output + ImageToBuffer(&net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_CHANNEL); + } else { + MACE_NOT_IMPLEMENTED; + } + + // Check + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output")); +} + +} // namespace + +TEST_F(Conv2dOpTest, FusedCPUSimple) { + TestFusedNHWCSimple3x3VALID(); + TestFusedNHWCSimple3x3WithoutBias(); +} + +TEST_F(Conv2dOpTest, FusedOPENCLSimple) { + TestFusedNHWCSimple3x3VALID(); + TestFusedNHWCSimple3x3WithoutBias(); +} + + namespace { template void TestConv1x1() { @@ -389,7 +544,7 @@ void TestConv1x1() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -431,7 +586,7 @@ void TestConv1x1() { TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1(); } -TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } +TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } namespace { template @@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector &shape, } // namespace TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 16, 16, 32}, 1); - TestComplexConvNxNS12({32, 16, 16, 32}, 2); + TestComplexConvNxNS12({32, 16, 16, 32}, 1); + TestComplexConvNxNS12({32, 16, 16, 32}, 2); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({17, 113, 5, 7}, 1); - TestComplexConvNxNS12({17, 113, 5, 7}, 2); + TestComplexConvNxNS12({17, 113, 5, 7}, 1); + TestComplexConvNxNS12({17, 113, 5, 7}, 2); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { - TestComplexConvNxNS12({31, 113, 13, 17}, 3); - TestComplexConvNxNS12({32, 32, 13, 17}, 4); + TestComplexConvNxNS12({31, 113, 13, 17}, 3); + TestComplexConvNxNS12({32, 32, 13, 17}, 4); } namespace { @@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, } // namespace TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { - TestHalfComplexConvNxNS12({32, 32}, {1, 1, 32, 64}, + TestHalfComplexConvNxNS12({32, 32}, {1, 1, 32, 64}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { - TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, + TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { - TestHalfComplexConvNxNS12({32, 32}, {15, 1, 256, 2}, + TestHalfComplexConvNxNS12({32, 32}, {15, 1, 256, 2}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { - TestHalfComplexConvNxNS12({32, 32}, {1, 15, 256, 2}, + TestHalfComplexConvNxNS12({32, 32}, {1, 15, 256, 2}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { - TestHalfComplexConvNxNS12({32, 32}, {7, 7, 3, 64}, + TestHalfComplexConvNxNS12({32, 32}, {7, 7, 3, 64}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { - TestHalfComplexConvNxNS12({107, 113}, {1, 1, 5, 7}, + TestHalfComplexConvNxNS12({107, 113}, {1, 1, 5, 7}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { - TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, + TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { - TestHalfComplexConvNxNS12({64, 64}, {5, 5, 16, 16}, + TestHalfComplexConvNxNS12({64, 64}, {5, 5, 16, 16}, {2, 2}); } TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) { - TestHalfComplexConvNxNS12({64, 64}, {7, 7, 16, 16}, + TestHalfComplexConvNxNS12({64, 64}, {7, 7, 16, 16}, {2, 2}); } TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { - TestHalfComplexConvNxNS12({63, 67}, {7, 7, 16, 16}, + TestHalfComplexConvNxNS12({63, 67}, {7, 7, 16, 16}, {4, 4}); } @@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector &shape, } // namespace TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) { - TestDilationConvNxN({32, 32, 32, 64}, 2); + TestDilationConvNxN({32, 32, 32, 64}, 2); } TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) { - TestDilationConvNxN({128, 128, 16, 16}, 4); + TestDilationConvNxN({128, 128, 16, 16}, 4); } TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { - TestDilationConvNxN({107, 113, 5, 7}, 4); + TestDilationConvNxN({107, 113, 5, 7}, 4); +} + +namespace { +template +void TestGeneralHalfAtrousConv(const std::vector &image_shape, + const std::vector &filter_shape, + const std::vector &dilations) { + testing::internal::LogToStderr(); + auto func = [&](int stride_h, int stride_w, Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 1; + index_t height = image_shape[0]; + index_t width = image_shape[1]; + index_t kernel_h = filter_shape[0]; + index_t kernel_w = filter_shape[1]; + index_t output_channels = filter_shape[2]; + index_t input_channels = filter_shape[3]; + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", + {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + net.AddRandomInput("Bias", {output_channels}); + + net.TransformDataFormat("Input", + NHWC, + "InputNCHW", + NCHW); + net.TransformDataFormat("Filter", + HWOI, + "FilterOIHW", + OIHW); + + // Construct graph + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputNCHW") + .Input("FilterOIHW") + .Input("Bias") + .Output("OutputNCHW") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", dilations) + .Finalize(net.NewOperatorDef()); + + // run on cpu + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", + NCHW, + "Output", + NHWC); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(&net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + kernels::BufferType::CONV2D_FILTER); + BufferToImage(&net, "Bias", "BiasImage", + kernels::BufferType::ARGUMENT); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", dilations) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + kernels::BufferType::IN_OUT_CHANNEL); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-1); + }; + + func(1, 1, VALID); + func(1, 1, SAME); +} +} // namespace + +TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) { + TestGeneralHalfAtrousConv({32, 32}, {7, 7, 16, 3}, + {2, 2}); +} + +TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) { + TestGeneralHalfAtrousConv({63, 71}, {15, 15, 16, 16}, + {2, 2}); } namespace { @@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector &shape, } // namespace TEST_F(Conv2dOpTest, OPENCLAlignedPad1) { - TestArbitraryPadConvNxN({32, 32, 32, 64}, {1, 1}); + TestArbitraryPadConvNxN({32, 32, 32, 64}, {1, 1}); } TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { - TestArbitraryPadConvNxN({128, 128, 16, 16}, + TestArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2}); } TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { - TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); } } // namespace test diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index 81ef5f53f36753d71b1a54cc29af530611edd13e..1874a178fe970e5e78b865d65b300a8115b02e5e 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) { .Finalize(&op_defs[op_defs.size() - 1]); Tensor *input = - ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::OPENCL), + ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v()); input->Resize({1, 3, 3, 3}); { @@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) { } std::shared_ptr op_registry(new OperatorRegistry()); auto net = - CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL, NetMode::INIT); + CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); net->Run(); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL); + net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); net->Run(); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index f98b06ce4c29dbc330b7214eddbc61841cd7472c..2f0e38c4a3ea48f4c09119f0c791f34f8fe8f9fc 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - DepthToSpaceOp); + DepthToSpaceOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - DepthToSpaceOp); + DepthToSpaceOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 48c7111c877a49bb3f5dfe21050d2598be838ad3..1e923edc5bd0a7cf19e9e217a4244a429c519b43 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator { int input_depth; if (D == CPU) { input_depth = input->dim(1); - } else if (D == OPENCL) { + } else if (D == GPU) { input_depth = input->dim(3); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index 5f95ab3bb126ce13a8a10c352ac54082b71142a9..bf05f6929cdd292beaa69854d1ef04b1ed159cda 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -31,7 +31,7 @@ void DepthToSpace( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -42,7 +42,7 @@ void DepthToSpace( .Input("Input") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -82,8 +82,8 @@ void DepthToSpace( #define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ - BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \ - BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL); + BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \ + BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU); BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 4012bfb8c18caf82b141e5b3e9cc04dac275ac33..692d2d62b2ceb3c16f1edac6079cb8955dc0dbb8 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s, } - if (D == DeviceType::OPENCL) { - ImageToBuffer(&net, "OutputImage", "Output", + if (D == DeviceType::GPU) { + ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } auto expected = CreateTensor(expected_shape, expected_data); @@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { } TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { - RunDepthToSpace(false, {1, 2, 4, 4}, + RunDepthToSpace(false, {1, 2, 4, 4}, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 2, @@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { } TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { - RunDepthToSpace(false, {1, 2, 2, 4}, + RunDepthToSpace(false, {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, @@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { } TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { - RunDepthToSpace(true, {1, 1, 2, 16}, + RunDepthToSpace(true, {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 2, @@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { } TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { - RunDepthToSpace(true, {1, 1, 1, 16}, + RunDepthToSpace(true, {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, @@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { const std::vector in = std::vector(192 * 192 *128, 1.0); - RunDepthToSpace(true, {1, 192, 192, 128}, + RunDepthToSpace(true, {1, 192, 192, 128}, in, 2, {1, 384, 384, 32}, @@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size, } // namespace TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { - RandomTest(true, 2, {1, 192, 192, 128}); + RandomTest(true, 2, {1, 192, 192, 128}); } TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { -RandomTest(true, 2, {1, 192, 192, 128}); +RandomTest(true, 2, {1, 192, 192, 128}); } TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { -RandomTest(false, 2, {1, 384, 384, 32}); +RandomTest(false, 2, {1, 384, 384, 32}); } TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { -RandomTest(false, 2, {1, 384, 384, 32}); +RandomTest(false, 2, {1, 384, 384, 32}); } } // namespace test diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index ac6a3b6c5bef2886d9708a2f82e397d3dbe048e2..fc0205dbaa6c6f545b62d8a71460aae71e8d804e 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - DepthwiseConv2dOp); + DepthwiseConv2dOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - DepthwiseConv2dOp); + DepthwiseConv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 9074089e7bb07759540b127c176f2107c5756c97..38ef26160d97b33e98bc2aab85679e32c9d0e32f 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters, net.AddRandomInput( "Filter", {multiplier, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {input_channels * multiplier}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( @@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters, .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters, #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); + BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ + BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 509c18888ce50156cb86c6a72fa8b8e65d5257d2..825396e1ba7c27c931f33ff5a66a7f71000c9e62 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -59,7 +59,7 @@ void SimpleValidTest() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { } TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) { - SimpleValidTest(); + SimpleValidTest(); } TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { - SimpleValidTest(); + SimpleValidTest(); } namespace { @@ -184,7 +184,7 @@ void ComplexValidTest() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Filter", "FilterImage", @@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) { } TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) { - ComplexValidTest(); + ComplexValidTest(); } TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { - ComplexValidTest(); + ComplexValidTest(); } namespace { @@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) { OpsTestNet net; // Add input data - net.AddRandomInput("Input", + net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( + net.AddRandomInput( "Filter", {kernel_h, kernel_w, input_channels, multiplier}); - net.AddRandomInput("Bias", + net.AddRandomInput("Bias", {multiplier * input_channels}); @@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) { Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", + BufferToImage(&net, "Filter", "FilterImage", kernels::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputImage") @@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) { .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); // Transfer output - ImageToBuffer(&net, + ImageToBuffer(&net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT_CHANNEL); diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 74b2b1e671d702afbf364789f412009334af7221..427716eadb1c6ee50b40c74461835e31c7419049 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - EltwiseOp); + EltwiseOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - EltwiseOp); + EltwiseOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 8c3843a6437621e8414abd7f82cb2f567262d3fd..55308069e4d79801b2d67dfeb9bcf9ee01f2ad9b 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -34,11 +34,11 @@ void EltwiseBenchmark( net.AddRandomInput("Input0", {n, h, w, c}); net.AddRandomInput("Input1", {n, h, w, c}); - if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input0", "InputImg0", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - kernels::BufferType::IN_OUT_CHANNEL); + if (D == DeviceType::GPU) { + BufferToImage(&net, "Input0", "InputImg0", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Input1", "InputImg1", + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") .Input("InputImg0") .Input("InputImg1") @@ -90,8 +90,8 @@ void EltwiseBenchmark( #define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL); + BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ + BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); BM_ELTWISE(2, 1, 128, 128, 32); BM_ELTWISE(2, 1, 240, 240, 256); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 7685d43616f9d92512b21ba332b0ee2caa7374f1..a156d95f54953aa2166676bf2dfa75bff9947a29 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -183,49 +183,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) { } TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) { - SimpleTensorScalar(kernels::EltwiseType::SUM, - {1, 1, 1, 1}, {1}, 1, - {2}); - SimpleTensorScalar(kernels::EltwiseType::SUB, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 1, - {0, 1, 2, 3, 4, 5}); - SimpleTensorScalar(kernels::EltwiseType::PROD, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 2, - {2, 4, 6, 8, 10, 12}); - SimpleTensorScalar(kernels::EltwiseType::DIV, - {1, 1, 2, 3}, - {2, 4, 6, 8, 10, 12}, - 2, - {1, 2, 3, 4, 5, 6}); - SimpleTensorScalar(kernels::EltwiseType::MIN, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 1, - {1, 1, 1, 1, 1, 1}); - SimpleTensorScalar(kernels::EltwiseType::MAX, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 3, - {3, 3, 3, 4, 5, 6}); - SimpleTensorScalar(kernels::EltwiseType::NEG, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 3, - {-1, -2, -3, -4, -5, -6}); - SimpleTensorScalar(kernels::EltwiseType::ABS, - {1, 1, 2, 3}, - {-1, -2, -3, -4, -5, -6}, - 3, - {1, 2, 3, 4, 5, 6}); - SimpleTensorScalar(kernels::EltwiseType::SQR_DIFF, - {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, - 1, - {0, 1, 4, 9, 16, 25}); + SimpleTensorScalar(kernels::EltwiseType::SUM, + {1, 1, 1, 1}, {1}, 1, + {2}); + SimpleTensorScalar(kernels::EltwiseType::SUB, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 1, + {0, 1, 2, 3, 4, 5}); + SimpleTensorScalar(kernels::EltwiseType::PROD, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 2, + {2, 4, 6, 8, 10, 12}); + SimpleTensorScalar(kernels::EltwiseType::DIV, + {1, 1, 2, 3}, + {2, 4, 6, 8, 10, 12}, + 2, + {1, 2, 3, 4, 5, 6}); + SimpleTensorScalar(kernels::EltwiseType::MIN, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 1, + {1, 1, 1, 1, 1, 1}); + SimpleTensorScalar(kernels::EltwiseType::MAX, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 3, + {3, 3, 3, 4, 5, 6}); + SimpleTensorScalar(kernels::EltwiseType::NEG, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 3, + {-1, -2, -3, -4, -5, -6}); + SimpleTensorScalar(kernels::EltwiseType::ABS, + {1, 1, 2, 3}, + {-1, -2, -3, -4, -5, -6}, + 3, + {1, 2, 3, 4, 5, 6}); + SimpleTensorScalar(kernels::EltwiseType::SQR_DIFF, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + 1, + {0, 1, 4, 9, 16, 25}); } TEST_F(EltwiseOpTest, CPUSimpleTensorVector) { @@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) { } TEST_F(EltwiseOpTest, GPUSimpleTensorVector) { - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3}, {1, 2, 3}, {2, 4, 6, 5, 7, 9}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5}, {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, @@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) { 25}); } TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) { - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); - SimpleTensorEltwise( + SimpleTensorEltwise( kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, @@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input", shape, true, true); + net.AddRandomInput("Input", shape, true, true); net.TransformDataFormat("Input", NHWC, @@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImg", + BufferToImage(&net, "Input", "InputImg", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") .Input("InputImg") @@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", + ImageToBuffer(&net, "OutputImg", "GPUOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } @@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape0, true, true); - net.AddRandomInput("Input1", shape1, true, true); + net.AddRandomInput("Input0", shape0, true, true); + net.AddRandomInput("Input1", shape1, true, true); net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); @@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input0", "InputImg0", + BufferToImage(&net, "Input0", "InputImg0", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", + BufferToImage(&net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") .Input("InputImg0") @@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", + ImageToBuffer(&net, "OutputImg", "GPUOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } @@ -609,19 +609,19 @@ TEST_F(EltwiseOpTest, RandomTensorTensorFloat) { TEST_F(EltwiseOpTest, RandomTensorTensorHalf) { RandomTensorEltwise(kernels::EltwiseType::SUM, - {1, 32, 32, 16}, {1, 32, 32, 16}); + {1, 32, 32, 16}, {1, 32, 32, 16}); RandomTensorEltwise(kernels::EltwiseType::SUB, - {3, 32, 32, 16}, {3, 32, 32, 16}); + {3, 32, 32, 16}, {3, 32, 32, 16}); RandomTensorEltwise(kernels::EltwiseType::PROD, - {1, 31, 37, 17}, {1, 31, 37, 17}); + {1, 31, 37, 17}, {1, 31, 37, 17}); RandomTensorEltwise(kernels::EltwiseType::DIV, - {5, 31, 37, 17}, {5, 31, 37, 17}); + {5, 31, 37, 17}, {5, 31, 37, 17}); RandomTensorEltwise(kernels::EltwiseType::MIN, - {1, 32, 32, 16}, {1, 32, 32, 16}); + {1, 32, 32, 16}, {1, 32, 32, 16}); RandomTensorEltwise(kernels::EltwiseType::MAX, - {3, 31, 37, 17}, {3, 31, 37, 17}); + {3, 31, 37, 17}, {3, 31, 37, 17}); RandomTensorEltwise(kernels::EltwiseType::SQR_DIFF, - {3, 31, 37, 17}, {3, 31, 37, 17}); + {3, 31, 37, 17}, {3, 31, 37, 17}); } diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc index 3f5a9a14b11453516cb1e0be6959b7551e1d047b..6c46195d88579771ab45f5abe3ceb43189a75678 100644 --- a/mace/ops/folded_batch_norm.cc +++ b/mace/ops/folded_batch_norm.cc @@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - FoldedBatchNormOp); + FoldedBatchNormOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - FoldedBatchNormOp); + FoldedBatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index f5e12d49c76c66c4f6448f434c33109bfa1207a0..dcb35b8d34a451767d233d923feeb976e757c645 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -60,7 +60,7 @@ void Simple() { // Run net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Scale", "ScaleImage", @@ -94,7 +94,7 @@ void Simple() { TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple(); } -TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple(); } +TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple(); } TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // generate random input @@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); net.TransformDataFormat("Input", NHWC, @@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") @@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); } @@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); net.TransformDataFormat("Input", NHWC, @@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") @@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); } @@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); net.TransformDataFormat("Input", NHWC, @@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") @@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); } @@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); net.TransformDataFormat("Input", NHWC, @@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", + BufferToImage(&net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", + BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") @@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); } diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 06318d7f03e14e5b4d24fbbc90bbb48b4cb76415..acd2f6b90113966a945f2948e947c3ff403ce818 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - FullyConnectedOp); + FullyConnectedOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - FullyConnectedOp); + FullyConnectedOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index fdb784e57fc8b34f9b468b4597e28f3cddf0035f..06127cead9ea21179775797883080f98b2ec0838 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -43,7 +43,7 @@ void FCBenchmark( .Input("Bias") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH; BufferToImage(&net, "Weight", "WeightImage", weight_type); @@ -93,8 +93,8 @@ void FCBenchmark( #define BM_FC(N, H, W, C, OC) \ BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ - BM_FC_MACRO(N, H, W, C, OC, float, OPENCL); \ - BM_FC_MACRO(N, H, W, C, OC, half, OPENCL); + BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ + BM_FC_MACRO(N, H, W, C, OC, half, GPU); BM_FC(1, 16, 16, 32, 32); BM_FC(1, 8, 8, 32, 1000); diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index c5ba2f1ce582948089f1359d2e1a66879437569c..97afa2d4890138612107ef12dd8e2dc40a9b733d 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -51,7 +51,7 @@ void Simple(const std::vector &input_shape, // Run net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Weight", "WeightImage", @@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { } TEST_F(FullyConnectedOpTest, SimpleOPENCL) { - Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, + Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, {206}); - Simple( + Simple( {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); - Simple( + Simple( {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6}, {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, @@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) { } TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { - Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, + Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); } @@ -136,11 +136,11 @@ void Complex(const index_t batch, OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput( + net.AddRandomInput( "Weight", {out_channel, height * width * channels}); - net.AddRandomInput("Bias", {out_channel}); + net.AddRandomInput("Bias", {out_channel}); OpDefBuilder("FC", "FullyConnectedTest") .Input("Input") @@ -159,11 +159,11 @@ void Complex(const index_t batch, expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", + BufferToImage(&net, "Weight", "WeightImage", kernels::BufferType::WEIGHT_HEIGHT); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FC", "FullyConnectedTest") @@ -176,9 +176,9 @@ void Complex(const index_t batch, .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), @@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch, OpsTestNet net; // Add input data - net.AddRandomInput( + net.AddRandomInput( "Input", {batch, height, width, channels}); - net.AddRandomInput( + net.AddRandomInput( "Weight", {out_channel, height * width * channels}); - net.AddRandomInput("Bias", {out_channel}); + net.AddRandomInput("Bias", {out_channel}); OpDefBuilder("FC", "FullyConnectedTest") .Input("Input") @@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch, expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", + BufferToImage(&net, "Weight", "WeightImage", kernels::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FC", "FullyConnectedTest") @@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch, .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), diff --git a/mace/ops/fused_conv_2d.cc b/mace/ops/fused_conv_2d.cc deleted file mode 100644 index cfe1a5db316abdbb5112aa4eb599c54f0cc72cb9..0000000000000000000000000000000000000000 --- a/mace/ops/fused_conv_2d.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/fused_conv_2d.h" - -namespace mace { -namespace ops { - -void Register_FusedConv2D(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FusedConv2dOp); - -#ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - FusedConv2dOp); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - FusedConv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h deleted file mode 100644 index a2a255ef5242457e2995da418fc881383fbc2512..0000000000000000000000000000000000000000 --- a/mace/ops/fused_conv_2d.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_FUSED_CONV_2D_H_ -#define MACE_OPS_FUSED_CONV_2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/conv_2d.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class FusedConv2dOp : public ConvPool2dOpBase { - public: - FusedConv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data(), - kernels::StringToActivationType( - OperatorBase::GetSingleArgument("activation", - "NOOP")), - OperatorBase::GetSingleArgument("max_limit", 0.0f), - static_cast(OperatorBase::GetSingleArgument( - "is_filter_transformed", false)), - ws->GetScratchBuffer(D)) {} - - bool Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *filter = this->Input(FILTER); - const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr; - Tensor *output = this->Output(OUTPUT); - - functor_(input, filter, bias, output, future); - - return true; - } - - private: - kernels::Conv2dFunctor functor_; - - protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FUSED_CONV_2D_H_ diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc deleted file mode 100644 index 6e3099a3e59c4d7c7f0e4ef6ae984deb737c3a1c..0000000000000000000000000000000000000000 --- a/mace/ops/fused_conv_2d_test.cc +++ /dev/null @@ -1,859 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "mace/ops/fused_conv_2d.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class FusedConv2dOpTest : public OpsTestBase {}; - -namespace { -template -void TestNHWCSimple3x3VALID() { - OpsTestNet net; - // Add input data - net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); - net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {-0.1f}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - } else if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); - - } else { - MACE_NOT_IMPLEMENTED; - } - - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); - ExpectTensorNear(*expected, *net.GetOutput("Output")); -} - -template -void TestNHWCSimple3x3SAME() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); - net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {-0.1f}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - } else if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); - - } else { - MACE_NOT_IMPLEMENTED; - } - - auto expected = CreateTensor( - {1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); - - ExpectTensorNear(*expected, *net.GetOutput("Output")); -} -} // namespace - -TEST_F(FusedConv2dOpTest, CPUSimple) { - TestNHWCSimple3x3VALID(); - TestNHWCSimple3x3SAME(); -} - -TEST_F(FusedConv2dOpTest, OPENCLSimple) { - TestNHWCSimple3x3VALID(); - TestNHWCSimple3x3SAME(); -} - -namespace { -template -void TestNHWCSimple3x3WithoutBias() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); - net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Output("OutputNCHW") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - } else if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); - } else { - MACE_NOT_IMPLEMENTED; - } - - // Check - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); - - ExpectTensorNear(*expected, *net.GetOutput("Output")); -} -} // namespace - -TEST_F(FusedConv2dOpTest, CPUWithoutBias) { - TestNHWCSimple3x3WithoutBias(); -} - -TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) { - TestNHWCSimple3x3WithoutBias(); -} - -namespace { -template -void TestConv1x1() { - // Construct graph - OpsTestNet net; - - // Add input data - net.AddInputFromArray( - "Input", {1, 3, 10, 5}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( - "Filter", {1, 1, 2, 5}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - } else if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); - } else { - MACE_NOT_IMPLEMENTED; - } - - // Check - auto expected = CreateTensor( - {1, 3, 10, 2}, - {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); - - ExpectTensorNear(*expected, *net.GetOutput("Output")); -} -} // namespace - -TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1(); } - -TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } - -namespace { -template -void TestComplexConvNxNS12(const std::vector &shape) { - testing::internal::LogToStderr(); - auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, - Padding type) { - // generate random input - static unsigned int seed = time(NULL); - index_t batch = 3 + (rand_r(&seed) % 10); - index_t height = shape[0]; - index_t width = shape[1]; - index_t input_channels = shape[2] + (rand_r(&seed) % 10); - index_t output_channels = shape[3] + (rand_r(&seed) % 10); - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); - net.AddRandomInput("Bias", {output_channels}); - - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - - // Construct graph - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // run on cpu - net.RunOp(); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run on gpu - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-5, 1e-4); - }; - - for (int kernel_size : {1, 3}) { - for (int stride : {1, 2}) { - func(kernel_size, kernel_size, stride, stride, VALID); - func(kernel_size, kernel_size, stride, stride, SAME); - } - } -} -} // namespace - -TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({107, 113, 5, 7}); -} - -namespace { -template -void TestHalfComplexConvNxNS12(const std::vector &shape, - const int kernel, const int stride, - Padding type) { - testing::internal::LogToStderr(); - // generate random input - srand(time(NULL)); - index_t batch = 3; - index_t height = shape[0]; - index_t width = shape[1]; - index_t input_channels = shape[2]; - index_t output_channels = shape[3]; - - OpsTestNet net; - - std::vector float_input_data; - GenerateRandomRealTypeData({batch, height, width, input_channels}, - &float_input_data); - std::vector float_filter_data; - GenerateRandomRealTypeData( - {kernel, kernel, output_channels, input_channels}, - &float_filter_data); - std::vector float_bias_data; - GenerateRandomRealTypeData({output_channels}, &float_bias_data); - // Add input data - net.AddInputFromArray( - "Input", {batch, height, width, input_channels}, float_input_data); - net.AddInputFromArray( - "Filter", {kernel, kernel, output_channels, input_channels}, - float_filter_data); - net.AddInputFromArray("Bias", {output_channels}, float_bias_data); - - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - - // Construct graph - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - - // run on cpu - net.RunOp(); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run on gpu - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-2, 1e-1); -} -} // namespace - -TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv1x1S12) { - TestHalfComplexConvNxNS12({32, 32, 32, 64}, 1, 1, VALID); - TestHalfComplexConvNxNS12({31, 37, 31, 37}, 1, 1, SAME); - TestHalfComplexConvNxNS12({32, 32, 32, 64}, 1, 2, VALID); - TestHalfComplexConvNxNS12({31, 37, 31, 37}, 1, 2, SAME); -} -TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv3x3S12) { - TestHalfComplexConvNxNS12({32, 32, 32, 64}, 3, 1, VALID); - TestHalfComplexConvNxNS12({31, 37, 31, 37}, 3, 1, SAME); - TestHalfComplexConvNxNS12({32, 32, 32, 64}, 3, 2, VALID); - TestHalfComplexConvNxNS12({31, 37, 31, 37}, 3, 2, SAME); -} - -namespace { -template -void TestGeneralConvNxNS12(const std::vector &image_shape, - const std::vector &filter_shape) { - testing::internal::LogToStderr(); - auto func = [&](int stride_h, int stride_w, Padding type) { - srand(time(NULL)); - - // generate random input - index_t batch = 1; - index_t height = image_shape[0]; - index_t width = image_shape[1]; - index_t kernel_h = filter_shape[0]; - index_t kernel_w = filter_shape[1]; - index_t output_channels = filter_shape[2]; - index_t input_channels = filter_shape[3]; - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); - net.AddRandomInput("Bias", {output_channels}); - - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - - // Construct graph - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // run on cpu - net.RunOp(); - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run on gpu - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-5, 1e-4); - }; - - for (int stride : {1, 2}) { - func(stride, stride, VALID); - func(stride, stride, SAME); - } -} -} // namespace - -TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) { - TestGeneralConvNxNS12({32, 32}, {7, 7, 64, 3}); -} - -TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) { - TestGeneralConvNxNS12({40, 40}, {15, 1, 64, 32}); -} - -namespace { -template -void TestAtrousConvNxN(const std::vector &shape, - const int dilation) { - testing::internal::LogToStderr(); - auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, - Padding type) { - srand(time(NULL)); - - // generate random input - index_t batch = 1; - index_t height = shape[0]; - index_t width = shape[1]; - index_t output_channels = shape[2]; - index_t input_channels = shape[3]; - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); - net.AddRandomInput("Bias", {output_channels}); - - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - - // Construct graph - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // run on cpu - net.RunOp(); - - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run on gpu - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-5, 1e-4); - }; - - for (int kernel_size : {3}) { - for (int stride : {1}) { - func(kernel_size, kernel_size, stride, stride, VALID); - func(kernel_size, kernel_size, stride, stride, SAME); - } - } -} -} // namespace - -TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) { - TestAtrousConvNxN({128, 128, 16, 16}, 2); -} - -TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) { - TestAtrousConvNxN({128, 128, 16, 16}, 4); -} - -TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) { - TestAtrousConvNxN({107, 113, 5, 7}, 2); -} - -namespace { -template -void TestGeneralHalfAtrousConv(const std::vector &image_shape, - const std::vector &filter_shape, - const std::vector &dilations) { - testing::internal::LogToStderr(); - auto func = [&](int stride_h, int stride_w, Padding type) { - srand(time(NULL)); - - // generate random input - index_t batch = 1; - index_t height = image_shape[0]; - index_t width = image_shape[1]; - index_t kernel_h = filter_shape[0]; - index_t kernel_w = filter_shape[1]; - index_t output_channels = filter_shape[2]; - index_t input_channels = filter_shape[3]; - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", - {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); - net.AddRandomInput("Bias", {output_channels}); - - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - - // Construct graph - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Input("Bias") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - - // run on cpu - net.RunOp(); - - net.TransformDataFormat("OutputNCHW", - NCHW, - "Output", - NHWC); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run on gpu - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-2, 1e-1); - }; - - func(1, 1, VALID); - func(1, 1, SAME); -} -} // namespace - -TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) { - TestGeneralHalfAtrousConv({32, 32}, {7, 7, 16, 3}, - {2, 2}); -} - -TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) { - TestGeneralHalfAtrousConv({63, 71}, {15, 15, 16, 16}, - {2, 2}); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/image_to_buffer.cc index 1dfd9321b9c500b2b0aadcd12a07f8c9cab066a7..02bcc5f70cb6c04e8a9d3c0e20832b26c7ed2ea8 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/image_to_buffer.cc @@ -19,16 +19,16 @@ namespace ops { void Register_ImageToBuffer(OperatorRegistry *op_registry) { REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ImageToBufferOp); + ImageToBufferOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ImageToBufferOp); + ImageToBufferOp); } } // namespace ops diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 2b8169499ae5bf30871f3265ccaee8e2d5b06d0d..b65bf4841b16f28807ae19dbc97e0a1b52e48c19 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -26,16 +26,16 @@ void Register_MatMul(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - MatMulOp); + MatMulOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - MatMulOp); + MatMulOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index ada15ae5e55c8092debcdbdaedd188c0336d0172..cc9b86dab67a48b2eeee9eea3352528e04d0cd94 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -34,7 +34,7 @@ void MatMulBenchmark( net.AddRandomInput("A", {batch, height, channels, 1}); net.AddRandomInput("B", {batch, channels, out_width, 1}); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH); BufferToImage(&net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT); @@ -79,8 +79,8 @@ void MatMulBenchmark( #define BM_MATMUL(N, H, C, W) \ BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ - BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \ - BM_MATMUL_MACRO(N, H, C, W, half, OPENCL); + BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ + BM_MATMUL_MACRO(N, H, C, W, half, GPU); BM_MATMUL(16, 32, 128, 49); BM_MATMUL(16, 32, 128, 961); diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index fd86b7d8f1c61dc970a26aa54422a5487866c06c..da949b80086dd515ee3c28eb804e51dc5e692127 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -37,7 +37,7 @@ void Simple(const std::vector &A_shape, net.AddInputFromArray("A", A_shape, A_value); net.AddInputFromArray("B", B_shape, B_value); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH); BufferToImage(&net, "B", "BImage", @@ -91,10 +91,10 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { } TEST_F(MatMulOpTest, SimpleOPENCL) { - Simple({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1}, + Simple({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64}); - Simple( + Simple( {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, @@ -127,9 +127,9 @@ void Complex(const index_t batch, .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("A", + net.AddRandomInput("A", {batch, height, channels, 1}); - net.AddRandomInput( + net.AddRandomInput( "B", {batch, channels, out_width, 1}); // run cpu @@ -140,9 +140,9 @@ void Complex(const index_t batch, expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "A", "AImage", + BufferToImage(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", + BufferToImage(&net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT); OpDefBuilder("MatMul", "MatMulTest") @@ -153,9 +153,9 @@ void Complex(const index_t batch, .Finalize(net.NewOperatorDef()); // Run on opencl - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_HEIGHT); if (DataTypeToEnum::value == DataType::DT_HALF) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 1439bf08c8adefca8524e68b6d34d74bec2deceb..b88a36c283e30cb9da00f585c3370fd4b1c0434a 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -403,7 +403,7 @@ class OpsTestNet { } void Sync() { - if (net_ && device_ == DeviceType::OPENCL) { + if (net_ && device_ == DeviceType::GPU) { OpenCLRuntime::Global()->command_queue().finish(); } } diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 5e0cd9f76af025794155e3abee4428202a050296..8bcf2028426a1dbe6085074811fb1e01c9bd4e60 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -26,15 +26,15 @@ void Register_Pad(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - PadOp); + PadOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - PadOp); + PadOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index e99a543acf6cdcfe3a8cbf57a7f6478977e61a4e..4be091dbf27880c6f396c08fd497bafbd6d00573 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -33,7 +33,7 @@ void Pad(int iters, int batch, int height, net.AddRandomInput("Input", {batch, height, width, channels}); const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") @@ -77,8 +77,8 @@ void Pad(int iters, int batch, int height, #define BM_PAD(N, H, W, C, PAD) \ BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ - BM_PAD_MACRO(N, H, W, C, PAD, float, OPENCL); \ - BM_PAD_MACRO(N, H, W, C, PAD, half, OPENCL); + BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ + BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); BM_PAD(1, 512, 512, 1, 2); BM_PAD(1, 112, 112, 64, 1); diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index f5069a806d99297c635c76f12b4cffbea5faf844..e244acec79efc21bbef698b97bff1fad6590ec57 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -29,7 +29,7 @@ void Simple() { // Add input data net.AddRepeatedInput("Input", {1, 2, 3, 1}, 2); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") @@ -45,15 +45,24 @@ void Simple() { ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } else { + net.TransformDataFormat("Input", + NHWC, + "TInput", + NCHW); OpDefBuilder("Pad", "PadTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) + .Input("TInput") + .Output("TOutput") + .AddIntsArg("paddings", {0, 0, 0, 0, 1, 2, 1, 2}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(); + + net.TransformDataFormat("TOutput", + NCHW, + "Output", + NHWC); } auto output = net.GetTensor("Output"); @@ -75,7 +84,7 @@ TEST_F(PadTest, SimpleCPU) { } TEST_F(PadTest, SimpleGPU) { - Simple(); + Simple(); } TEST_F(PadTest, ComplexCPU) { @@ -84,15 +93,23 @@ TEST_F(PadTest, ComplexCPU) { // Add input data net.AddRepeatedInput("Input", {1, 1, 1, 2}, 2); + net.TransformDataFormat("Input", + NHWC, + "TInput", + NCHW); OpDefBuilder("Pad", "PadTest") - .Input("Input") - .Output("Output") + .Input("TInput") + .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(); + net.TransformDataFormat("TOutput", + NCHW, + "Output", + NHWC); auto output = net.GetTensor("Output"); @@ -109,39 +126,48 @@ TEST_F(PadTest, ComplexCPU) { namespace { template void Complex(const std::vector &input_shape, - const std::vector &paddings) { + const std::vector &cpu_paddings, + const std::vector &gpu_paddings) { // Construct graph OpsTestNet net; // Add input data - net.AddRandomInput("Input", input_shape); + net.AddRandomInput("Input", input_shape); + net.TransformDataFormat("Input", + NHWC, + "TInput", + NCHW); OpDefBuilder("Pad", "PadTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", paddings) + .Input("TInput") + .Output("TOutput") + .AddIntsArg("paddings", cpu_paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(); + net.TransformDataFormat("TOutput", + NCHW, + "Output", + NHWC); Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") .Input("InputImage") .Output("OutputImage") - .AddIntsArg("paddings", paddings) + .AddIntsArg("paddings", gpu_paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run - net.RunOp(DeviceType::OPENCL); + net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OpenCLOutput", + ImageToBuffer(&net, "OutputImage", "OpenCLOutput", kernels::BufferType::IN_OUT_CHANNEL); auto output = net.GetTensor("OpenCLOutput"); @@ -155,15 +181,21 @@ void Complex(const std::vector &input_shape, } // namespace TEST_F(PadTest, ComplexFloat) { - Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, + {0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, + {0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, + {0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0}); } TEST_F(PadTest, ComplexHalf) { - Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, + {0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, + {0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, + {0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0}); } } // namespace test diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 21dd59c94c35922adea43f7641a13a9d713edbac..ac15bb1b8cb63fbccefc28145f556e90d87b003a 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -26,16 +26,16 @@ void Register_Pooling(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - PoolingOp); + PoolingOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - PoolingOp); + PoolingOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 6c0be19fac0f124a5254a22ab0d06314ca905eeb..1f767c229fd0bc1db2652b7e7d5035804d95831d 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -41,7 +41,7 @@ void Pooling(int iters, if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { @@ -58,7 +58,7 @@ void Pooling(int iters, .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -104,7 +104,7 @@ void Pooling(int iters, #define BM_POOLING(N, C, H, W, K, S, PA, PO) \ BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, OPENCL); + BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 09b5ffb3b378cf82682e0f2e9d123e4ae3cb6d09..9a2c769bc03e2dde4a0956deeb5bc3350fa54b92 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -211,7 +211,7 @@ void SimpleMaxPooling3S2() { NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") @@ -238,7 +238,7 @@ void SimpleMaxPooling3S2() { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { - SimpleMaxPooling3S2(); + SimpleMaxPooling3S2(); } namespace { @@ -304,24 +304,24 @@ void MaxPooling3S2(const std::vector &input_shape, } // namespace TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { - MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); - MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); - MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); - MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) { - MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); - MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); - MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); - MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { - MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); - MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); - MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); - MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); } TEST_F(PoolingOpTest, AVG_VALID) { @@ -400,7 +400,7 @@ void SimpleAvgPoolingTest() { } // namespace TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { - SimpleAvgPoolingTest(); + SimpleAvgPoolingTest(); } namespace { @@ -468,43 +468,43 @@ void AvgPoolingTest(const std::vector &shape, } // namespace TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { - AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); - AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) { - AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); - AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { - AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); - AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) { - AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); - AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { - AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID); - AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { - AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID); - AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME); } diff --git a/mace/ops/reorganize.cc b/mace/ops/reorganize.cc deleted file mode 100644 index d4e3cddecf7122515581bda9bf5e1314224ce194..0000000000000000000000000000000000000000 --- a/mace/ops/reorganize.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/reorganize.h" - -namespace mace { -namespace ops { - -void Register_ReOrganize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReOrganize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReOrganizeOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reorganize.h b/mace/ops/reorganize.h deleted file mode 100644 index 6f5270446219337e45835862d042d0e122814013..0000000000000000000000000000000000000000 --- a/mace/ops/reorganize.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_REORGANIZE_H_ -#define MACE_OPS_REORGANIZE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/reorganize.h" - -namespace mace { -namespace ops { - -template -class ReOrganizeOp : public Operator { - public: - ReOrganizeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - shape_(OperatorBase::GetRepeatedArgument("shape")) {} - - bool Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const index_t num_dims = shape_.size(); - int unknown_idx = -1; - index_t product = 1; - std::vector out_shape; - - for (int i = 0; i < num_dims; ++i) { - if (shape_[i] == -1) { - MACE_CHECK(unknown_idx == -1) << "Only one input size may be -1"; - unknown_idx = i; - out_shape.push_back(1); - } else { - MACE_CHECK(shape_[i] >= 0) << "Shape must be non-negative: " - << shape_[i]; - out_shape.push_back(shape_[i]); - product *= shape_[i]; - } - } - - if (unknown_idx != -1) { - MACE_CHECK(product != 0) - << "Cannot infer shape if there is zero shape size."; - const index_t missing = input->size() / product; - MACE_CHECK(missing * product == input->size()) - << "Input size not match reshaped tensor size"; - out_shape[unknown_idx] = missing; - } - - Tensor *output = this->Output(OUTPUT); - output->Resize(out_shape); - - functor_(input, out_shape, output, future); - return true; - } - - private: - std::vector shape_; - kernels::ReOrganizeFunctor functor_; - - protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REORGANIZE_H_ diff --git a/mace/ops/reorganize_test.cc b/mace/ops/reorganize_test.cc deleted file mode 100644 index aa25f9986bad8105a3e8f5ca2ad788726160298c..0000000000000000000000000000000000000000 --- a/mace/ops/reorganize_test.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "gmock/gmock.h" -#include "mace/core/operator.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class ReOrganizeTest : public OpsTestBase {}; - -namespace { -void TestReOrganize(const std::vector &input_shape, - const std::vector &input_data, - const std::vector &output_shape, - const std::vector &output_data) { - const std::vector out_shape(output_shape.begin(), output_shape.end()); - - // Construct graph - OpsTestNet net; - - OpDefBuilder("ReOrganize", "ReOrganizeTest") - .Input("Input") - .Output("Output") - .AddIntsArg("shape", out_shape) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray("Input", - input_shape, input_data); - - // Run - net.RunOp(); - - auto output = net.GetTensor("Output"); - - EXPECT_THAT(output->shape(), ::testing::ContainerEq(output_shape)); - - const float *output_ptr = output->data(); - int size = output->size(); - for (int i = 0; i < size; ++i) { - ASSERT_EQ(output_data[i], output_ptr[i]) << "With Index " << i; - } - - // Reverse reorganzie - const std::vector in_shape(input_shape.begin(), input_shape.end()); - OpDefBuilder("ReOrganize", "ReOrganizeTest") - .Input("Input") - .Output("Output") - .AddIntsArg("shape", in_shape) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray("Input", - output_shape, output_data); - - // Run - net.RunOp(); - - output = net.GetTensor("Output"); - - EXPECT_THAT(output->shape(), ::testing::ContainerEq(input_shape)); - - output_ptr = output->data(); - size = output->size(); - for (int i = 0; i < size; ++i) { - ASSERT_EQ(input_data[i], output_ptr[i]) << "With Index " << i; - } -} -} // namespace - -TEST_F(ReOrganizeTest, Simple) { - TestReOrganize({1, 1, 4, 6}, - {0, 4, 8, 12, 16, 20, - 1, 5, 9, 13, 17, 21, - 2, 6, 10, 14, 18, 22, - 3, 7, 11, 15, 19, 23}, - {1, 1, 8, 3}, - {0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, - 4, 12, 20, 5, 13, 21, 6, 14, 22, 7, 15, 23}); - TestReOrganize({1, 1, 5, 6}, - {0, 5, 10, 15, 20, 25, - 1, 6, 11, 16, 21, 26, - 2, 7, 12, 17, 22, 27, - 3, 8, 13, 18, 23, 28, - 4, 9, 14, 19, 24, 29}, - {1, 1, 10, 3}, - {0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23, - 4, 14, 24, 5, 15, 25, 6, 16, 26, 7, 17, 27, - 8, 18, 28, 9, 19, 29}); -} - -TEST_F(ReOrganizeTest, Complex) { - TestReOrganize({1, 2, 2, 6}, - {0, 4, 8, 12, 16, 20, - 1, 5, 9, 13, 17, 21, - 2, 6, 10, 14, 18, 22, - 3, 7, 11, 15, 19, 23}, - {1, 2, 6, 2}, - {0, 12, 1, 13, 4, 16, 5, 17, 8, 20, 9, 21, - 2, 14, 3, 15, 6, 18, 7, 19, 10, 22, 11, 23}); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 9368bfecb420e5676f58291494ce4da72c4cbc74..65a84926eae43765ffedd42c2cfbbdf31bcc6fc6 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -26,16 +26,16 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ResizeBilinearOp); + ResizeBilinearOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ResizeBilinearOp); + ResizeBilinearOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 8f5162ca1bb2d6246acff93c386b0dad6dc827bc..add324053b7881dd1c52a62bf21a74d0941a26ff 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -38,7 +38,7 @@ void ResizeBilinearBenchmark(int iters, if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, input_height, input_width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, input_height, input_width, channels}); } else { @@ -55,7 +55,7 @@ void ResizeBilinearBenchmark(int iters, .AddIntsArg("size", {output_height, output_width}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") @@ -99,8 +99,8 @@ void ResizeBilinearBenchmark(int iters, #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \ BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \ - BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, OPENCL); \ - BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, OPENCL); + BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \ + BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU); BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480); diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index e5c244673583b072b687df9345addcada2f18867..9b92e89f81318b4729971cb94c68694293c22ecd 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -132,7 +132,7 @@ void TestRandomResizeBilinear() { Tensor expected; expected.Copy(*net.GetOutput("Output")); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -147,8 +147,6 @@ void TestRandomResizeBilinear() { ImageToBuffer(&net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT_CHANNEL); - } else { - // TODO(someone): support NEON } // Check ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), @@ -158,7 +156,7 @@ void TestRandomResizeBilinear() { } // namespace TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) { - TestRandomResizeBilinear(); + TestRandomResizeBilinear(); } } // namespace test diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index 382368579518c028787a990e2236ba3fc4c69e82..a146e00be53168e32882c65a79a0bfc2108afebf 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -26,16 +26,16 @@ void Register_Slice(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SliceOp); + SliceOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SliceOp); + SliceOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/slice_benchmark.cc b/mace/ops/slice_benchmark.cc index ff85e4ef9fb48aa3d6ae685b39a82e1c8da91c32..b05138aee2218e7e727d4446f828028404ce1def 100644 --- a/mace/ops/slice_benchmark.cc +++ b/mace/ops/slice_benchmark.cc @@ -38,7 +38,7 @@ void BMSliceHelper(int iters, GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -85,8 +85,8 @@ void BMSliceHelper(int iters, #define BM_SLICE(N, H, W, C, NO) \ BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ - BM_SLICE_MACRO(N, H, W, C, NO, float, OPENCL); \ - BM_SLICE_MACRO(N, H, W, C, NO, half, OPENCL); + BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ + BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); BM_SLICE(1, 32, 32, 32, 2); BM_SLICE(1, 32, 32, 128, 2); diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc index 27efc56470e32a880875c3b8eefa4104eabeb326..92a54a8649e9cdcc6f1920dd75348cb790dd03e6 100644 --- a/mace/ops/slice_test.cc +++ b/mace/ops/slice_test.cc @@ -51,7 +51,7 @@ void RandomTest(const int num_outputs, const int axis) { GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -75,7 +75,7 @@ void RandomTest(const int num_outputs, const int axis) { // Run net.RunOp(D); - if (D == DeviceType::OPENCL) { + if (D == DeviceType::GPU) { for (int i = 0; i < num_outputs; ++i) { ImageToBuffer(&net, MakeString("OutputImage", i), @@ -130,15 +130,15 @@ TEST_F(SliceOpTest, CPUAxis1) { } TEST_F(SliceOpTest, OPENCLFloat) { - RandomTest(2, 3); - RandomTest(4, 3); - RandomTest(11, 3); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } TEST_F(SliceOpTest, OPENCLHalf) { - RandomTest(2, 3); - RandomTest(4, 3); - RandomTest(11, 3); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } } // namespace test diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 2f68ed92db13af990e3b9475cfdcc32a10273627..2e2e9e7ff5b44a6038c8d80a5c598db78c31061c 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -26,16 +26,16 @@ void Register_Softmax(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SoftmaxOp); + SoftmaxOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SoftmaxOp); + SoftmaxOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 10abc9493148100b6996bb5d75d282b77804413a..b62eadcb81faaae7e33dda91e2fbf2c9b0640ada 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -33,7 +33,7 @@ void SoftmaxBenchmark( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -44,7 +44,7 @@ void SoftmaxBenchmark( .Input("Input") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -82,8 +82,8 @@ void SoftmaxBenchmark( #define BM_SOFTMAX(N, C, H, W) \ BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ - BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL); \ - BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL); + BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ + BM_SOFTMAX_MACRO(N, C, H, W, half, GPU); BM_SOFTMAX(1, 2, 512, 512); BM_SOFTMAX(1, 3, 512, 512); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 13781d986c2688795f7736b506a85932cd245e1c..6de118d332ccc454a6a9f19aacf308a1aa33ef77 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -40,7 +40,7 @@ void Simple() { // Run net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -68,7 +68,7 @@ void Simple() { } // namespace TEST_F(SoftmaxOpTest, CPUSimple) { Simple(); } -TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple(); } +TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple(); } namespace { template @@ -114,18 +114,18 @@ void Complex(const std::vector &logits_shape) { } // namespace TEST_F(SoftmaxOpTest, OPENCLAligned) { - Complex({1, 256, 256, 3}); - Complex({1, 128, 128, 16}); + Complex({1, 256, 256, 3}); + Complex({1, 128, 128, 16}); } TEST_F(SoftmaxOpTest, OPENCLMulBatchAligned) { - Complex({5, 64, 64, 3}); - Complex({8, 128, 128, 8}); + Complex({5, 64, 64, 3}); + Complex({8, 128, 128, 8}); } TEST_F(SoftmaxOpTest, OPENCLUnAligned) { - Complex({1, 113, 107, 13}); - Complex({5, 211, 107, 1}); + Complex({1, 113, 107, 13}); + Complex({5, 211, 107, 1}); } } // namespace test diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 704203b2341b8c18eaa61420185105489773a0de..0564209a28b8071ae02ce6f9fdbb5eef4a7128f2 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -20,16 +20,16 @@ namespace ops { void Register_SpaceToBatchND(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SpaceToBatchNDOp); + SpaceToBatchNDOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SpaceToBatchNDOp); + SpaceToBatchNDOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index fda5bd709b45a9a859bfef73b89f88b1f1717a6c..ff3ee74925bcbafe037c7af4e732aeb861e0a1e9 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -65,7 +65,7 @@ void BMSpaceToBatch( BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE) #define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \ - BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, OPENCL); + BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); BM_SPACE_TO_BATCH(128, 16, 16, 128, 2); BM_SPACE_TO_BATCH(1, 256, 256, 32, 2); diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 601078e69757d5d0103f204997fa2e7de858f749..dc3e69ed90552cfda15fe65d36d81221e0dd7023 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -85,7 +85,7 @@ void TestBidirectionalTransform(const std::vector &space_shape, const std::vector &batch_shape, const std::vector &batch_data) { auto space_tensor = std::unique_ptr(new Tensor( - GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v())); + GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); space_tensor->Resize(space_shape); { Tensor::MappingGuard space_mapper(space_tensor.get()); @@ -97,7 +97,7 @@ void TestBidirectionalTransform(const std::vector &space_shape, } auto batch_tensor = std::unique_ptr(new Tensor( - GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v())); + GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); batch_tensor->Resize(batch_shape); { Tensor::MappingGuard batch_mapper(batch_tensor.get()); @@ -106,10 +106,10 @@ void TestBidirectionalTransform(const std::vector &space_shape, memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); } - RunSpaceToBatch(space_shape, space_data, block_data, + RunSpaceToBatch(space_shape, space_data, block_data, padding_data, batch_tensor.get()); - RunBatchToSpace(batch_shape, batch_data, block_data, + RunBatchToSpace(batch_shape, batch_data, block_data, padding_data, space_tensor.get()); } } // namespace diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 8ffd17b86e184e4b73157474e44ffd8d4dc129c4..2932f8ea3eb9c65da90f076f24441dba06499968 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -26,16 +26,16 @@ void Register_SpaceToDepth(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SpaceToDepthOp); + SpaceToDepthOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - SpaceToDepthOp); + SpaceToDepthOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h index 6624f88015cd2650c97949914c23b350fb2ae1cd..1b593fafef62497193fb4808835f29e89790bc8d 100644 --- a/mace/ops/space_to_depth.h +++ b/mace/ops/space_to_depth.h @@ -45,7 +45,7 @@ class SpaceToDepthOp : public Operator { input_height = input->dim(2); input_width = input->dim(3); input_depth = input->dim(1); - } else if (D == OPENCL) { + } else if (D == GPU) { input_height = input->dim(1); input_width = input->dim(2); input_depth = input->dim(3); diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index ec0dd870f785b3286ef9cc8c66e2fa0246769a0a..66c90b641b7ad030f63ba89d80dafed6fe4e458a 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -31,7 +31,7 @@ void SpaceToDepth( // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, height, channels, width}); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -42,7 +42,7 @@ void SpaceToDepth( .Input("Input") .Output("Output") .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::OPENCL) { + } else if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -82,8 +82,8 @@ void SpaceToDepth( #define BM_SPACE_TO_DEPTH(N, C, H, W, G) \ BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \ - BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, OPENCL); \ - BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, OPENCL); + BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \ + BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU); BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4); BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4); diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 14fa421d7743809ed723e33bb2a78d6de201df9d..c8f6a4ece45d7ee09f81a2d865233256b47cd1cb 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -147,23 +147,23 @@ void WinogradConvolution(const index_t batch, } // namespace TEST_F(WinogradConvlutionTest, AlignedConvolution) { - WinogradConvolution(1, 32, 32, 32, 16, + WinogradConvolution(1, 32, 32, 32, 16, Padding::VALID); - WinogradConvolution(1, 32, 32, 32, 16, + WinogradConvolution(1, 32, 32, 32, 16, Padding::SAME); } TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { - WinogradConvolution(1, 61, 67, 31, 37, + WinogradConvolution(1, 61, 67, 31, 37, Padding::VALID); - WinogradConvolution(1, 61, 67, 37, 31, + WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME); } TEST_F(WinogradConvlutionTest, BatchConvolution) { - WinogradConvolution(3, 64, 64, 32, 32, + WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID); - WinogradConvolution(5, 61, 67, 37, 31, + WinogradConvolution(5, 61, 67, 37, 31, Padding::SAME); } diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc index 3f33c6c6eb1dd7bbd2c1a24224ca11fd22b5aec7..bcee9d64eb145f3ee7f599a4bd2505c0ca423443 100644 --- a/mace/ops/winograd_inverse_transform.cc +++ b/mace/ops/winograd_inverse_transform.cc @@ -20,16 +20,16 @@ namespace ops { void Register_WinogradInverseTransform(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - WinogradInverseTransformOp); + WinogradInverseTransformOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - WinogradInverseTransformOp); + WinogradInverseTransformOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc index 0be6fee6d1408d897094b3ecdef049f7d4f3862c..5c2d53d4764cf38198f275393dc950e4e36e0bd1 100644 --- a/mace/ops/winograd_transform.cc +++ b/mace/ops/winograd_transform.cc @@ -20,16 +20,16 @@ namespace ops { void Register_WinogradTransform(OperatorRegistry *op_registry) { #ifdef MACE_ENABLE_OPENCL REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - WinogradTransformOp); + WinogradTransformOp); REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::OPENCL) + .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - WinogradTransformOp); + WinogradTransformOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc index 3efa7328d4935b8eed0e43d842f988396ea72f44..bf33332bb28db0a8040f0696dd4fc9e297019677 100644 --- a/mace/ops/winograd_transform_benchmark.cc +++ b/mace/ops/winograd_transform_benchmark.cc @@ -62,7 +62,7 @@ void BMWinogradTransform( BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) #define BM_WINOGRAD_TRANSFORM(N, H, W, C) \ - BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL); + BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU); BM_WINOGRAD_TRANSFORM(1, 16, 16, 128); BM_WINOGRAD_TRANSFORM(1, 64, 64, 128); @@ -116,7 +116,7 @@ void BMWinogradInverseTransform( BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) #define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \ - BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, OPENCL); + BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU); BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32); BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32); diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index c3744edafe7adda5acd6096aec50e70880bb2d44..f2d0d1e47469b65f485aacfbe76beeea717308a4 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -9,8 +9,7 @@ enum NetMode { enum DeviceType { CPU = 0; // In default, we will use CPU. - NEON = 1; - OPENCL = 2; + GPU = 2; } enum DataType { diff --git a/mace/public/mace.h b/mace/public/mace.h index 9bae74cf34fbaba6e2d4626065415044cb8aadf9..02d903fd1a7a9f40395452e40ab91ef74b5ca9be 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -28,7 +28,7 @@ namespace mace { const char *MaceVersion(); -enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 }; +enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 }; diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py index f48eec591ba7685de1fc53e2cd356fadf1131b44..f10b548ad24ca1f154fac468459a3bfb33aeeb03 100644 --- a/mace/python/tools/caffe_converter_lib.py +++ b/mace/python/tools/caffe_converter_lib.py @@ -475,8 +475,6 @@ class CaffeConverter(object): self.ops_map[final_op.name].children[0].type \ in activation_name_map: activation_op = self.ops_map[final_op.name].children[0] - if not is_depthwise: - op_def.type = "FusedConv2D" fused_act_arg = op_def.arg.add() fused_act_arg.name = 'activation' fused_act_arg.s = activation_name_map[activation_op.type] @@ -984,15 +982,10 @@ class CaffeConverter(object): self.resolved_ops.add(op.name) def convert_reshape(self, op): - if self.device == 'cpu': - op_def = self.CommonConvert(op, 'Reshape') - else: - op_def = self.CommonConvert(op, 'ReOrganize') + op_def = self.CommonConvert(op, 'Reshape') input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] output_shape = input_shape shape_param = np.asarray(op.layer.reshape_param.shape.dim) - if self.device != 'cpu': - shape_param = shape_param[[0, 3, 1, 2]] for i in range(len(shape_param)): if shape_param[i] != 0: output_shape[i] = shape_param[i] diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 31d33c93075378e126110370611a08af45774180..c5668fbebd98a55f71e7e6267e4601734b3f309a 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -508,8 +508,6 @@ class TFConverter(object): if len(self.tf_graph.get(final_op.name, [])) == 1 and \ self.tf_graph[final_op.name][0].type in activation_name_map: activation_op = self.tf_graph[final_op.name][0] - if op_def.type == "Conv2D": - op_def.type = "FusedConv2D" fused_act_arg = op_def.arg.add() fused_act_arg.name = 'activation' fused_act_arg.s = activation_name_map[activation_op.type] @@ -958,14 +956,17 @@ class TFConverter(object): conv_op = self.tf_graph[op.name][0] op_def.name = conv_op.name op_def.type = conv_op.type - self.transpose_filter_tensor[get_input_tensor(conv_op, - 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + self.transpose_filter_tensor[ + get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2) op_def.input.extend([op.inputs[0].name]) output_name = self.add_buffer_to_image( get_input_tensor(conv_op, 1).name, "CONV2D_FILTER") op_def.input.extend([output_name]) else: + self.transpose_filter_tensor[ + get_input_tensor(conv_op, 1).name] = (3, 2, 0, 1) op_def.input.extend([get_input_tensor(op, 0).name]) op_def.input.extend([get_input_tensor(conv_op, 1).name]) @@ -1020,7 +1021,6 @@ class TFConverter(object): if len(self.tf_graph[final_op.name]) == 1 and \ self.tf_graph[final_op.name][0].type == 'Relu': relu_op = self.tf_graph[final_op.name][0] - op_def.type = "FusedConv2D" fused_relu_arg = op_def.arg.add() fused_relu_arg.name = 'activation' fused_relu_arg.s = "RELU" @@ -1092,8 +1092,12 @@ class TFConverter(object): op_def.output.extend([output.name for output in op.outputs]) paddings_arg = op_def.arg.add() paddings_arg.name = 'paddings' - paddings_arg.ints.extend( - get_input_tensor(op, 1).eval().astype(np.int32).flat) + if self.device == 'gpu': + paddings_value = get_input_tensor(op, 1).eval().astype(np.int32) + else: + paddings_value = get_input_tensor(op, 1).eval().astype(np.int32) + paddings_value = paddings_value[[0, 3, 1, 2]] + paddings_arg.ints.extend(paddings_value.flat) self.unused_tensor.add(get_input_tensor(op, 1).name) if len(op.inputs) == 3: constant_value_arg = op_def.arg.add() diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index fc22a450edfc5a5971d6717e4db01c2bf2dc96ad..33ba50c00d50e8477ce71865ed8422fdab8464d0 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -248,7 +248,7 @@ void MaceRun(const int in_out_size, std::string filter_tensor_name = "filter"; std::string filter_tensor_img_name = filter_tensor_name + "_image"; - const DeviceType device = DeviceType::OPENCL; + const DeviceType device = DeviceType::GPU; NetDef net_def; @@ -300,7 +300,7 @@ void MaceRun(const int in_out_size, } } - CheckOutputs(net_def, inputs, outputs); + CheckOutputs(net_def, inputs, outputs); } } // namespace diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 2a709ededfb34fe95123870d3c28911c8a08591e..1af468e2deec1ca4051b36a12b4cf2757a3d8bcf 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -22,7 +22,7 @@ * --input_file=input_data \ * --output_file=mace.out \ * --model_data_file=model_data.data \ - * --device=OPENCL + * --device=GPU */ #include #include @@ -108,10 +108,8 @@ std::string FormatName(const std::string input) { DeviceType ParseDeviceType(const std::string &device_str) { if (device_str.compare("CPU") == 0) { return DeviceType::CPU; - } else if (device_str.compare("NEON") == 0) { - return DeviceType::NEON; - } else if (device_str.compare("OPENCL") == 0) { - return DeviceType::OPENCL; + } else if (device_str.compare("GPU") == 0) { + return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; } else { @@ -203,7 +201,7 @@ DEFINE_string(output_file, DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); -DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); @@ -234,7 +232,7 @@ bool RunModel(const std::vector &input_names, FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); #ifdef MACE_ENABLE_OPENCL - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); @@ -252,7 +250,7 @@ bool RunModel(const std::vector &input_names, new FileStorageFactory(kernel_file_path)); SetKVStorageFactory(storage_factory); mace::MaceEngine engine(&net_def, device_type, input_names, output_names); - if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } int64_t t2 = NowMicros(); @@ -329,7 +327,7 @@ bool RunModel(const std::vector &input_names, mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis); #ifdef MACE_ENABLE_OPENCL - if (device_type == DeviceType::OPENCL) { + if (device_type == DeviceType::GPU) { WriteOpenCLPlatformInfo(kernel_file_path); } #endif // MACE_ENABLE_OPENCL diff --git a/tools/mace_tools.py b/tools/mace_tools.py index c8fbcdf4501e77fd147b8ca3ac40464f0f009525..b063c023f8544a46117e0ed7c184e948a12babbd 100644 --- a/tools/mace_tools.py +++ b/tools/mace_tools.py @@ -69,7 +69,7 @@ def get_data_and_device_type(runtime): device_type = "HEXAGON" elif runtime == "gpu": data_type = "DT_HALF" - device_type = "OPENCL" + device_type = "GPU" elif runtime == "cpu": data_type = "DT_FLOAT" device_type = "CPU"