From 19dcf2c3678a73cbbb21e34bf1895ca49447c81c Mon Sep 17 00:00:00 2001 From: liuqi Date: Wed, 7 Nov 2018 11:02:13 +0800 Subject: [PATCH] Feature: Remove the BufferTransform ops of GPU initialization phase. 1. Remove the BufferTransform ops of GPU initialization phase. 2. Add tranformer to tranform the memory type between CPU and GPU. --- mace/core/net.cc | 5 +- mace/core/operator.h | 4 +- mace/core/tensor.h | 11 ++ mace/core/transformer.h | 32 +++++ mace/ops/BUILD | 4 +- mace/ops/activation.cc | 8 ++ mace/ops/activation_test.cc | 4 +- mace/ops/batch_norm.cc | 14 ++ mace/ops/bias_add.cc | 6 + mace/ops/buffer_inverse_transform.cc | 2 +- mace/ops/buffer_transform.cc | 22 +-- mace/ops/conv_2d.cc | 15 ++- mace/ops/depthwise_conv2d.cc | 15 ++- mace/ops/eltwise.cc | 14 ++ mace/ops/fully_connected.cc | 12 ++ .../opencl/buffer/buffer_inverse_transform.h | 4 +- mace/ops/opencl/buffer/buffer_transform.h | 2 +- ..._transform.h => buffer_transform_kernel.h} | 11 +- mace/ops/opencl/buffer_transformer.cc | 27 ++++ mace/ops/opencl/buffer_transformer.h | 126 ++++++++++++++++++ mace/ops/opencl/cl/eltwise.cl | 2 +- mace/ops/opencl/common.h | 1 - mace/ops/opencl/image/buffer_to_image.h | 2 +- mace/ops/opencl/image/image_to_buffer.h | 4 +- mace/ops/transformer.cc | 56 ++++++++ ...ffer_inverse_transform.h => transformer.h} | 33 ++--- 26 files changed, 386 insertions(+), 50 deletions(-) create mode 100644 mace/core/transformer.h rename mace/ops/opencl/{buffer_transform.h => buffer_transform_kernel.h} (83%) create mode 100644 mace/ops/opencl/buffer_transformer.cc create mode 100644 mace/ops/opencl/buffer_transformer.h create mode 100644 mace/ops/transformer.cc rename mace/ops/{opencl/buffer_inverse_transform.h => transformer.h} (55%) diff --git a/mace/core/net.cc b/mace/core/net.cc index 63ca5792..1fe5b0e9 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -77,9 +77,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, } MaceStatus SerialNet::Init() { - // TODO(liuqi): where to do memory reuse. MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); OpInitContext init_context(ws_); + // TODO(liuqi): where to do memory reuse. + if (target_device_->device_type() == DeviceType::GPU) { + + } for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); diff --git a/mace/core/operator.h b/mace/core/operator.h index c354afbd..7017240c 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -131,14 +131,14 @@ class Operation { } inline void set_debug_def( - const std::shared_ptr &operator_def) { + const std::shared_ptr &operator_def) { operator_def_ = operator_def; } inline bool has_debug_def() const { return operator_def_ != nullptr; } protected: - std::shared_ptr operator_def_; + std::shared_ptr operator_def_; std::vector inputs_; std::vector outputs_; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 7cf01043..f217bee4 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -222,6 +222,17 @@ class Tensor { return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image(); } + inline MemoryType memory_type() const { + MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" ); + if (buffer_->OnHost()) { + return MemoryType::CPU_BUFFER; + } else if (typeid(*buffer_) == typeid(Image)) { + return MemoryType::GPU_IMAGE; + } else { + return MemoryType::GPU_BUFFER; + } + } + #ifdef MACE_ENABLE_OPENCL inline cl::Image *opencl_image() const { MACE_CHECK(has_opencl_image(), name_, " do not have image"); diff --git a/mace/core/transformer.h b/mace/core/transformer.h new file mode 100644 index 00000000..09f56009 --- /dev/null +++ b/mace/core/transformer.h @@ -0,0 +1,32 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_TRANSFORMER_H_ +#define MACE_CORE_TRANSFORMER_H_ + +#include "mace/proto/mace.pb.h" + +namespace mace { + +class TransformerBase { + public: + // Construct transform operation. + virtual std::vector> ConstructTranformOp( + OperatorDef *op_def, + bool transform_filter = true) = 0; +}; + +} // namespace mace + +#endif // MACE_CORE_TRANSFORMER_H_ diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 1e1efc89..5fb0683c 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -31,7 +31,7 @@ cc_library( "ops_registry.cc", "ops_test_util.cc", "buffer_inverse_transform.cc", - "buffer_transform.cc", + "buffer_transformer.cc", "lstm_cell.cc", "winograd_transform.cc", "quantize.cc", @@ -42,7 +42,7 @@ cc_library( "opencl/image/*.cc", "opencl/buffer/*.cc", "buffer_inverse_transform.cc", - "buffer_transform.cc", + "buffer_transformer.cc", "lstm_cell.cc", "winograd_transform.cc", ], diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 7c733177..19b3289f 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/activation.h" #endif // MACE_ENABLE_OPENCL @@ -79,12 +80,19 @@ class ActivationOp : public Operation { "NOOP")); auto relux_max_limit = static_cast( Operation::GetOptionalArg("max_limit", 0.0f)); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset( new opencl::image::ActivationKernel(type, relux_max_limit)); } else { MACE_NOT_IMPLEMENTED; } + if (type == ActivationType::PRELU) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f56a3a17..f127be42 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -237,12 +237,10 @@ void TestSimplePrelu() { if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Alpha", "AlphaImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Activation", "PreluTest") .Input("InputImage") - .Input("AlphaImage") + .Input("Alpha") .Output("OutputImage") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 07c00189..cf022d6a 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/batch_norm.h" #endif // MACE_ENABLE_OPENCL @@ -147,12 +148,25 @@ class BatchNormOp : public Operation { ActivationType activation = ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP")); float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::BatchNormKernel( epsilon, activation, relux_max_limit)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + int input_size = operator_def_->input_size(); + for (int i = 0; i < input_size; ++i) { + const Tensor *input_tensor = context->workspace()->GetTensor( + operator_def_->input(i)); + if (input_tensor != nullptr && input_tensor->is_weight()) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } + } } MaceStatus Run(OpContext *context) override { bool not_folded = this->InputSize() == 5; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 9b528fa9..0b406dd1 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/bias_add.h" #endif // MACE_ENABLE_OPENCL @@ -99,11 +100,16 @@ class BiasAddOp : public Operation { : Operation(context), data_format_(static_cast(Operation::GetOptionalArg( "data_format", NHWC))) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::BiasAddKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc index 8cfd72b5..8482e255 100644 --- a/mace/ops/buffer_inverse_transform.cc +++ b/mace/ops/buffer_inverse_transform.cc @@ -51,7 +51,7 @@ class BufferInverseTransformOp : public Operation { private: const int wino_blk_size_; - std::unique_ptr kernel_; + std::unique_ptr kernel_; }; diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index cb127880..1accbe21 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -15,8 +15,7 @@ #include #include "mace/core/operator.h" -#include "mace/ops/opencl/buffer/buffer_transform.h" -#include "mace/ops/opencl/image/buffer_to_image.h" +#include "mace/ops/opencl/buffer_transformer.h" namespace mace { namespace ops { @@ -29,12 +28,16 @@ class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)), + out_mem_type_(MemoryType::GPU_BUFFER), + transformer_(nullptr) { + MemoryType in_mem_type = context->workspace()->GetTensor( + operator_def_->input(0))->memory_type(); if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BufferToImage); - } else { - kernel_.reset(new opencl::buffer::BufferTransform); + out_mem_type_ = MemoryType::GPU_IMAGE; } + transformer_.reset(new OpenCLBufferTransformer(in_mem_type, + out_mem_type_)); } MaceStatus Run(OpContext *context) override { @@ -45,13 +48,14 @@ class BufferTransformOp : public Operation { static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(ops::CONV2D_FILTER))); - return kernel_->Compute(context, input, type, - wino_blk_size_, output); + return transformer_->Transform( + context, input, type, wino_blk_size_, out_mem_type_, output); } private: const int wino_blk_size_; - std::unique_ptr kernel_; + MemoryType out_mem_type_; + std::unique_ptr> transformer_; }; diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 7bb213c0..bf5ebaa0 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -38,8 +38,9 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -#include "mace/ops/opencl/image/conv_2d.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/conv_2d.h" +#include "mace/ops/opencl/image/conv_2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { @@ -960,11 +961,23 @@ class Conv2dOp : public ConvPool2dOpBase { Operation::GetOptionalArg("activation", "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::Conv2dKernel); } else { + mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::Conv2dKernel); } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 1f1dd136..47f45e4a 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -34,8 +34,9 @@ #include "mace/public/mace.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/ops/opencl/image/depthwise_conv2d.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/depthwise_conv2d.h" +#include "mace/ops/opencl/image/depthwise_conv2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { @@ -490,11 +491,23 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::DepthwiseConv2dKernel); } else { + mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 29f0c5a7..96384cde 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -26,6 +26,7 @@ #include "mace/core/tensor.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/eltwise.h" #endif // MACE_ENABLE_OPENCL @@ -1086,12 +1087,25 @@ class EltwiseOp : public Operation { float scalar_input = Operation::GetOptionalArg("scalar_input", 1.0); int32_t scalar_input_index = Operation::GetOptionalArg( "scalar_input_index", 1); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::EltwiseKernel( type, coeff, scalar_input, scalar_input_index)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + int input_size = operator_def_->input_size(); + for (int i = 0; i < input_size; ++i) { + const Tensor *input_tensor = context->workspace()->GetTensor( + operator_def_->input(i)); + if (input_tensor != nullptr && input_tensor->is_weight()) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } + } } MaceStatus Run(OpContext *context) override { const Tensor *input0 = this->Input(0); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 3705e415..cea80bf5 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -27,6 +27,7 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/fully_connected.h" #endif // MACE_ENABLE_OPENCL @@ -192,11 +193,22 @@ class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::FullyConnectedKernel); } else { MACE_NOT_IMPLEMENTED; } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h index 647f2514..8b05bf5f 100644 --- a/mace/ops/opencl/buffer/buffer_inverse_transform.h +++ b/mace/ops/opencl/buffer/buffer_inverse_transform.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ #define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#include "mace/ops/opencl/buffer_inverse_transform.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/core/op_context.h" #include "mace/core/tensor.h" @@ -34,7 +34,7 @@ MaceStatus BufferTypeTransform( Tensor *output); template -class BufferInverseTransform: public OpenCLBufferInverseTransformKernel { +class BufferInverseTransform: public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index 4919bb09..c9e31cfa 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ #define MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ -#include "mace/ops/opencl/buffer_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include diff --git a/mace/ops/opencl/buffer_transform.h b/mace/ops/opencl/buffer_transform_kernel.h similarity index 83% rename from mace/ops/opencl/buffer_transform.h rename to mace/ops/opencl/buffer_transform_kernel.h index 45808d40..5d4ff094 100644 --- a/mace/ops/opencl/buffer_transform.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -12,18 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ +#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ +#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ #include "mace/ops/opencl/common.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" namespace mace { - class OpContext; class Tensor; - namespace ops { class OpenCLBufferTransformKernel { public: @@ -32,10 +30,9 @@ class OpenCLBufferTransformKernel { const BufferType type, const int wino_blk_size, Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) }; - } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ +#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc new file mode 100644 index 00000000..e3b1b67b --- /dev/null +++ b/mace/ops/opencl/buffer_transformer.cc @@ -0,0 +1,27 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer_transformer.h" + +namespace mace { +namespace ops { + +std::string TransformedName(const std::string &name) { + // TODO(liuqi): This may create a conflict. + const char *postfix = "_mace_identity_transformed"; + return name + postfix; +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h new file mode 100644 index 00000000..78f82177 --- /dev/null +++ b/mace/ops/opencl/buffer_transformer.h @@ -0,0 +1,126 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ +#define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ + +#include "mace/core/operator.h" +#include "mace/ops/opencl/common.h" +#include "mace/ops/opencl/image/buffer_to_image.h" +#include "mace/ops/opencl/image/image_to_buffer.h" +#include "mace/ops/opencl/buffer/buffer_transform.h" + +namespace mace { +namespace ops { +// Only used for GPU Operation(BufferTransform) +template +class OpenCLBufferTransformer { + public: + OpenCLBufferTransformer(const MemoryType in_mem_type, + const MemoryType out_mem_type) { + if (out_mem_type == MemoryType::GPU_IMAGE) { + kernel_.reset(new opencl::image::BufferToImage); + } else if (in_mem_type == MemoryType::GPU_IMAGE){ + kernel_.reset(new opencl::image::ImageToBuffer); + } else { + kernel_.reset(new opencl::buffer::BufferTransform); + } + } + + MaceStatus Transform(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + const MemoryType out_mem_type, + Tensor *output) { + Workspace *ws = context->workspace(); + DataType dt = DataTypeToEnum::value; + MemoryType in_mem_type = input->memory_type(); + if (out_mem_type == MemoryType::GPU_IMAGE || + out_mem_type == MemoryType::GPU_BUFFER) { + if (in_mem_type != MemoryType::CPU_BUFFER) { + return kernel_->Compute( + context, input, type, wino_blk_size, output); + } else { + // convert to the GPU Buffer with the input's data type. + Tensor *internal_tensor = ws->CreateTensor( + InternalTransformedName(input->name()), + context->device()->allocator(), input->dtype()); + output->Resize(input->shape()); + const uint8_t *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + uint8_t *internal_ptr = internal_tensor->mutable_data(); + memcpy(internal_ptr, input_ptr, input->raw_size()); + // convert the internal GPU Buffer to output. + return kernel_->Compute( + context, internal_tensor, type, wino_blk_size, output); + } + } else { // out_mem_type == MemoryType::CPU_BUFFER + // convert to the GPU Buffer with the output's data type. + Tensor internal_tensor(context->device()->allocator(), + dt, + false, + InternalTransformedName(input->name())); + MACE_RETURN_IF_ERROR(kernel_->Compute( + context, input, type, wino_blk_size, &internal_tensor)); + // convert the internal GPU Buffer to output. + Tensor::MappingGuard guard(&internal_tensor); + const T *internal_ptr = internal_tensor.data(); + output->Resize(internal_tensor.shape()); + T *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + return MaceStatus::MACE_SUCCESS; + } + } + + private: + std::string InternalTransformedName(const std::string &name) { + // TODO(liuqi): This may create a conflict. + const char *postfix = "_mace_identity_internal"; + return name + postfix; + } + + private: + std::unique_ptr kernel_; +}; + +std::string TransformedName(const std::string &name); + +template +MaceStatus TransformFilter( + mace::OpConstructContext *context, + OperatorDef *op_def, + const int input_idx, + const BufferType buffer_type, + const MemoryType mem_type) { + const DataType dt = DataTypeToEnum::value; + OpContext op_context(context->workspace(), context->device()); + Workspace *ws = context->workspace(); + std::string input_name = op_def->input(input_idx); + Tensor *input = ws->GetTensor(input_name); + std::string output_name = TransformedName(input_name); + Tensor *output = + ws->CreateTensor(output_name, context->device()->allocator(), dt); + + // update the information + op_def->set_input(input_idx, output_name); + input->MarkUnused(); + return OpenCLBufferTransformer(input->memory_type(), mem_type). + Transform(&op_context, input, buffer_type, 0, mem_type, output); +} + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ diff --git a/mace/ops/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl index 931d0eca..6f352d4f 100644 --- a/mace/ops/opencl/cl/eltwise.cl +++ b/mace/ops/opencl/cl/eltwise.cl @@ -45,7 +45,7 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS DATA_TYPE4 out; #if ELTWISE_TYPE == 0 #ifdef COEFF_SUM - out = mad(coeff1, in0, mad(coeff0, in1, 0)); + out = mad(coeff0, in0, mad(coeff1, in1, 0)); #else out = in0 + in1; #endif diff --git a/mace/ops/opencl/common.h b/mace/ops/opencl/common.h index 0a238960..f0bf872e 100644 --- a/mace/ops/opencl/common.h +++ b/mace/ops/opencl/common.h @@ -29,7 +29,6 @@ enum BufferType { WEIGHT_HEIGHT = 7, WEIGHT_WIDTH = 8, }; - } // namespace ops } // namespace mace #endif // MACE_OPS_OPENCL_COMMON_H_ diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 64319721..e84691f8 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ #define MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ -#include "mace/ops/opencl/buffer_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include #include diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 9aa65f0e..4200087e 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -20,7 +20,7 @@ #include #include "mace/core/op_context.h" -#include "mace/ops/opencl/buffer_inverse_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -29,7 +29,7 @@ namespace opencl { namespace image { template -class ImageToBuffer : public OpenCLBufferInverseTransformKernel { +class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, diff --git a/mace/ops/transformer.cc b/mace/ops/transformer.cc new file mode 100644 index 00000000..7df66ffa --- /dev/null +++ b/mace/ops/transformer.cc @@ -0,0 +1,56 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/transformer.h" + +#include +#include + +namespace mace { +namespace ops { + +std::unique_ptr Transformer::DoTransform( + mace::OperatorDef *op_def, + const int input_idx, + const mace::DataType dt, + const BufferType buffer_type, + const MemoryType mem_type) { + int32_t device = op_def->device_type(); + std::string input_name = op_def->input(input_idx); + std::string output_name = input_name + "_transformed"; + + op_def->set_input(input_idx, output_name); + std::unique_ptr op(new OperatorDef); + op->set_name(output_name); + op->set_type("BufferTransform"); + op->add_input(input_name); + op->add_output(output_name); + Argument *arg = op->add_arg(); + arg->set_name("buffer_type"); + arg->set_i(static_cast(buffer_type)); + arg = op->add_arg(); + arg->set_name("mem_type"); + arg->set_i(static_cast(mem_type)); + arg = op->add_arg(); + arg->set_name("T"); + arg->set_i(static_cast(dt)); + arg = op->add_arg(); + arg->set_name("device"); + arg->set_i(device); + + return std::move(op); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer_inverse_transform.h b/mace/ops/transformer.h similarity index 55% rename from mace/ops/opencl/buffer_inverse_transform.h rename to mace/ops/transformer.h index 7f52a64f..67ecd60f 100644 --- a/mace/ops/opencl/buffer_inverse_transform.h +++ b/mace/ops/transformer.h @@ -12,30 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#ifndef MACE_KERNELS_TRANSFORMER_H_ +#define MACE_KERNELS_TRANSFORMER_H_ +#include "mace/core/transformer.h" #include "mace/ops/opencl/common.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" namespace mace { - class OpContext; -class Tensor; - namespace ops { -class OpenCLBufferInverseTransformKernel { + +class Transformer : public TransformerBase { public: - virtual MaceStatus Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel) + // Transform source tensor to target. + std::vector> ConstructTranformOp( + OperatorDef *op_def, + bool transform_filter = true) override; + private: + std::unique_ptr DoTransform( + mace::OperatorDef *op_def, + const int input_idx, + const mace::DataType dt, + const BufferType buffer_type, + const MemoryType mem_type); }; + } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#endif // MACE_KERNELS_TENSOR_TRANSFORMER_H_ -- GitLab