diff --git a/mace/core/net.cc b/mace/core/net.cc index 63ca5792a9c199448350bab7d623f94c1e2c25b3..1fe5b0e947fb4d7f9f5d82b91e48f6096cdfcb8b 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -77,9 +77,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, } MaceStatus SerialNet::Init() { - // TODO(liuqi): where to do memory reuse. MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); OpInitContext init_context(ws_); + // TODO(liuqi): where to do memory reuse. + if (target_device_->device_type() == DeviceType::GPU) { + + } for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); diff --git a/mace/core/operator.h b/mace/core/operator.h index c354afbdd548c781ec37bb2554c905e03ccb4b18..7017240c8194e9bbe2cac7fe06c85b534683e7f2 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -131,14 +131,14 @@ class Operation { } inline void set_debug_def( - const std::shared_ptr &operator_def) { + const std::shared_ptr &operator_def) { operator_def_ = operator_def; } inline bool has_debug_def() const { return operator_def_ != nullptr; } protected: - std::shared_ptr operator_def_; + std::shared_ptr operator_def_; std::vector inputs_; std::vector outputs_; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 7cf01043608f5da524308b373420827b3f981f39..f217bee42f7b6615453704c375e79d08cb1c4666 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -222,6 +222,17 @@ class Tensor { return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image(); } + inline MemoryType memory_type() const { + MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" ); + if (buffer_->OnHost()) { + return MemoryType::CPU_BUFFER; + } else if (typeid(*buffer_) == typeid(Image)) { + return MemoryType::GPU_IMAGE; + } else { + return MemoryType::GPU_BUFFER; + } + } + #ifdef MACE_ENABLE_OPENCL inline cl::Image *opencl_image() const { MACE_CHECK(has_opencl_image(), name_, " do not have image"); diff --git a/mace/core/transformer.h b/mace/core/transformer.h new file mode 100644 index 0000000000000000000000000000000000000000..09f56009e0114dd5de9f017a3dbeb66dbff2eea3 --- /dev/null +++ b/mace/core/transformer.h @@ -0,0 +1,32 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_TRANSFORMER_H_ +#define MACE_CORE_TRANSFORMER_H_ + +#include "mace/proto/mace.pb.h" + +namespace mace { + +class TransformerBase { + public: + // Construct transform operation. + virtual std::vector> ConstructTranformOp( + OperatorDef *op_def, + bool transform_filter = true) = 0; +}; + +} // namespace mace + +#endif // MACE_CORE_TRANSFORMER_H_ diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 1e1efc8998ae14a7fab0f25e0ad40aa8e519eee6..5fb0683cf1be1d8936ec411877f4f3492ac1f960 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -31,7 +31,7 @@ cc_library( "ops_registry.cc", "ops_test_util.cc", "buffer_inverse_transform.cc", - "buffer_transform.cc", + "buffer_transformer.cc", "lstm_cell.cc", "winograd_transform.cc", "quantize.cc", @@ -42,7 +42,7 @@ cc_library( "opencl/image/*.cc", "opencl/buffer/*.cc", "buffer_inverse_transform.cc", - "buffer_transform.cc", + "buffer_transformer.cc", "lstm_cell.cc", "winograd_transform.cc", ], diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 7c73317736e83d619980bb4b7925dabc0dcd54bb..19b3289fcb70b16344edd4dcd8f80552ba6f389a 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/activation.h" #endif // MACE_ENABLE_OPENCL @@ -79,12 +80,19 @@ class ActivationOp : public Operation { "NOOP")); auto relux_max_limit = static_cast( Operation::GetOptionalArg("max_limit", 0.0f)); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset( new opencl::image::ActivationKernel(type, relux_max_limit)); } else { MACE_NOT_IMPLEMENTED; } + if (type == ActivationType::PRELU) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f56a3a17578f3675c38ceb16fec6c98c9c0a18bf..f127be425f9be4478f0fdf7fbadcaeb2ff6bc0a8 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -237,12 +237,10 @@ void TestSimplePrelu() { if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Alpha", "AlphaImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Activation", "PreluTest") .Input("InputImage") - .Input("AlphaImage") + .Input("Alpha") .Output("OutputImage") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 07c00189860bafa4dfed989d677db5556c6bf442..cf022d6ae7a2e9cee8bf4368869f8e8eab9faf68 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/batch_norm.h" #endif // MACE_ENABLE_OPENCL @@ -147,12 +148,25 @@ class BatchNormOp : public Operation { ActivationType activation = ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP")); float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::BatchNormKernel( epsilon, activation, relux_max_limit)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + int input_size = operator_def_->input_size(); + for (int i = 0; i < input_size; ++i) { + const Tensor *input_tensor = context->workspace()->GetTensor( + operator_def_->input(i)); + if (input_tensor != nullptr && input_tensor->is_weight()) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } + } } MaceStatus Run(OpContext *context) override { bool not_folded = this->InputSize() == 5; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 9b528fa9f2169fd8268bcc9ea94521e172a807fd..0b406dd1269b2f2ad6925b232b2d845566836c9b 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -19,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/bias_add.h" #endif // MACE_ENABLE_OPENCL @@ -99,11 +100,16 @@ class BiasAddOp : public Operation { : Operation(context), data_format_(static_cast(Operation::GetOptionalArg( "data_format", NHWC))) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::BiasAddKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc index 8cfd72b559d74fcd0a0cf2caf5dbca1dc15a3f5e..8482e2552a55c7e7d681a4e5239d510cc4f2bdfb 100644 --- a/mace/ops/buffer_inverse_transform.cc +++ b/mace/ops/buffer_inverse_transform.cc @@ -51,7 +51,7 @@ class BufferInverseTransformOp : public Operation { private: const int wino_blk_size_; - std::unique_ptr kernel_; + std::unique_ptr kernel_; }; diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index cb127880e99058dfb36faeaa59452351e41303d3..1accbe213585ddb6d8c0058fee076fd191d87f2f 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -15,8 +15,7 @@ #include #include "mace/core/operator.h" -#include "mace/ops/opencl/buffer/buffer_transform.h" -#include "mace/ops/opencl/image/buffer_to_image.h" +#include "mace/ops/opencl/buffer_transformer.h" namespace mace { namespace ops { @@ -29,12 +28,16 @@ class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)), + out_mem_type_(MemoryType::GPU_BUFFER), + transformer_(nullptr) { + MemoryType in_mem_type = context->workspace()->GetTensor( + operator_def_->input(0))->memory_type(); if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BufferToImage); - } else { - kernel_.reset(new opencl::buffer::BufferTransform); + out_mem_type_ = MemoryType::GPU_IMAGE; } + transformer_.reset(new OpenCLBufferTransformer(in_mem_type, + out_mem_type_)); } MaceStatus Run(OpContext *context) override { @@ -45,13 +48,14 @@ class BufferTransformOp : public Operation { static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(ops::CONV2D_FILTER))); - return kernel_->Compute(context, input, type, - wino_blk_size_, output); + return transformer_->Transform( + context, input, type, wino_blk_size_, out_mem_type_, output); } private: const int wino_blk_size_; - std::unique_ptr kernel_; + MemoryType out_mem_type_; + std::unique_ptr> transformer_; }; diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 7bb213c0bc70f9a47d0b7c3964b050b76bcccdba..bf5ebaa0c07abd30cc7884bb1b896621d6e67e09 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -38,8 +38,9 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -#include "mace/ops/opencl/image/conv_2d.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/conv_2d.h" +#include "mace/ops/opencl/image/conv_2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { @@ -960,11 +961,23 @@ class Conv2dOp : public ConvPool2dOpBase { Operation::GetOptionalArg("activation", "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::Conv2dKernel); } else { + mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::Conv2dKernel); } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 1f1dd1363770d18c0d590742cba7de7f5ec8d29e..47f45e4a35277726aefe948a7fa5079b0616c2c2 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -34,8 +34,9 @@ #include "mace/public/mace.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/ops/opencl/image/depthwise_conv2d.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/depthwise_conv2d.h" +#include "mace/ops/opencl/image/depthwise_conv2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { @@ -490,11 +491,23 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::DepthwiseConv2dKernel); } else { + mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 29f0c5a7ed29834fbd43d3a8951959fa20f1524d..96384cde4fac981064952a7cc7f916e671b63ab6 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -26,6 +26,7 @@ #include "mace/core/tensor.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/eltwise.h" #endif // MACE_ENABLE_OPENCL @@ -1086,12 +1087,25 @@ class EltwiseOp : public Operation { float scalar_input = Operation::GetOptionalArg("scalar_input", 1.0); int32_t scalar_input_index = Operation::GetOptionalArg( "scalar_input_index", 1); + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::EltwiseKernel( type, coeff, scalar_input, scalar_input_index)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + int input_size = operator_def_->input_size(); + for (int i = 0; i < input_size; ++i) { + const Tensor *input_tensor = context->workspace()->GetTensor( + operator_def_->input(i)); + if (input_tensor != nullptr && input_tensor->is_weight()) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } + } } MaceStatus Run(OpContext *context) override { const Tensor *input0 = this->Input(0); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 3705e415e59ce17a92da48f5535fe7845d232e3f..cea80bf51e998300b53e5c8729a66aa82147fc0b 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -27,6 +27,7 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/fully_connected.h" #endif // MACE_ENABLE_OPENCL @@ -192,11 +193,22 @@ class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { + MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; kernel_.reset(new opencl::image::FullyConnectedKernel); } else { MACE_NOT_IMPLEMENTED; } + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() > 2) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h index 647f251492de7fd0a1947bb183889444bb141028..8b05bf5f0c34e801d501b390b05f64cb4b7e29c8 100644 --- a/mace/ops/opencl/buffer/buffer_inverse_transform.h +++ b/mace/ops/opencl/buffer/buffer_inverse_transform.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ #define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#include "mace/ops/opencl/buffer_inverse_transform.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/core/op_context.h" #include "mace/core/tensor.h" @@ -34,7 +34,7 @@ MaceStatus BufferTypeTransform( Tensor *output); template -class BufferInverseTransform: public OpenCLBufferInverseTransformKernel { +class BufferInverseTransform: public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index 4919bb099b6dda8229ff9cd052ab1127eba5235b..c9e31cfa04432d3b2758a13993e0850224d7cf43 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ #define MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ -#include "mace/ops/opencl/buffer_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include diff --git a/mace/ops/opencl/buffer_transform.h b/mace/ops/opencl/buffer_transform_kernel.h similarity index 83% rename from mace/ops/opencl/buffer_transform.h rename to mace/ops/opencl/buffer_transform_kernel.h index 45808d40e43d91732e30fdc8a6889777665bfc58..5d4ff09448cfee8f70af71f2365e43525a9e3087 100644 --- a/mace/ops/opencl/buffer_transform.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -12,18 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ +#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ +#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ #include "mace/ops/opencl/common.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" namespace mace { - class OpContext; class Tensor; - namespace ops { class OpenCLBufferTransformKernel { public: @@ -32,10 +30,9 @@ class OpenCLBufferTransformKernel { const BufferType type, const int wino_blk_size, Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) }; - } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_ +#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3b1b67b417b83879b1949874afa2624795d31f0 --- /dev/null +++ b/mace/ops/opencl/buffer_transformer.cc @@ -0,0 +1,27 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer_transformer.h" + +namespace mace { +namespace ops { + +std::string TransformedName(const std::string &name) { + // TODO(liuqi): This may create a conflict. + const char *postfix = "_mace_identity_transformed"; + return name + postfix; +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h new file mode 100644 index 0000000000000000000000000000000000000000..78f82177ee3fd091cf55bfcd79a815f1ebaa925d --- /dev/null +++ b/mace/ops/opencl/buffer_transformer.h @@ -0,0 +1,126 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ +#define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ + +#include "mace/core/operator.h" +#include "mace/ops/opencl/common.h" +#include "mace/ops/opencl/image/buffer_to_image.h" +#include "mace/ops/opencl/image/image_to_buffer.h" +#include "mace/ops/opencl/buffer/buffer_transform.h" + +namespace mace { +namespace ops { +// Only used for GPU Operation(BufferTransform) +template +class OpenCLBufferTransformer { + public: + OpenCLBufferTransformer(const MemoryType in_mem_type, + const MemoryType out_mem_type) { + if (out_mem_type == MemoryType::GPU_IMAGE) { + kernel_.reset(new opencl::image::BufferToImage); + } else if (in_mem_type == MemoryType::GPU_IMAGE){ + kernel_.reset(new opencl::image::ImageToBuffer); + } else { + kernel_.reset(new opencl::buffer::BufferTransform); + } + } + + MaceStatus Transform(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + const MemoryType out_mem_type, + Tensor *output) { + Workspace *ws = context->workspace(); + DataType dt = DataTypeToEnum::value; + MemoryType in_mem_type = input->memory_type(); + if (out_mem_type == MemoryType::GPU_IMAGE || + out_mem_type == MemoryType::GPU_BUFFER) { + if (in_mem_type != MemoryType::CPU_BUFFER) { + return kernel_->Compute( + context, input, type, wino_blk_size, output); + } else { + // convert to the GPU Buffer with the input's data type. + Tensor *internal_tensor = ws->CreateTensor( + InternalTransformedName(input->name()), + context->device()->allocator(), input->dtype()); + output->Resize(input->shape()); + const uint8_t *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + uint8_t *internal_ptr = internal_tensor->mutable_data(); + memcpy(internal_ptr, input_ptr, input->raw_size()); + // convert the internal GPU Buffer to output. + return kernel_->Compute( + context, internal_tensor, type, wino_blk_size, output); + } + } else { // out_mem_type == MemoryType::CPU_BUFFER + // convert to the GPU Buffer with the output's data type. + Tensor internal_tensor(context->device()->allocator(), + dt, + false, + InternalTransformedName(input->name())); + MACE_RETURN_IF_ERROR(kernel_->Compute( + context, input, type, wino_blk_size, &internal_tensor)); + // convert the internal GPU Buffer to output. + Tensor::MappingGuard guard(&internal_tensor); + const T *internal_ptr = internal_tensor.data(); + output->Resize(internal_tensor.shape()); + T *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + return MaceStatus::MACE_SUCCESS; + } + } + + private: + std::string InternalTransformedName(const std::string &name) { + // TODO(liuqi): This may create a conflict. + const char *postfix = "_mace_identity_internal"; + return name + postfix; + } + + private: + std::unique_ptr kernel_; +}; + +std::string TransformedName(const std::string &name); + +template +MaceStatus TransformFilter( + mace::OpConstructContext *context, + OperatorDef *op_def, + const int input_idx, + const BufferType buffer_type, + const MemoryType mem_type) { + const DataType dt = DataTypeToEnum::value; + OpContext op_context(context->workspace(), context->device()); + Workspace *ws = context->workspace(); + std::string input_name = op_def->input(input_idx); + Tensor *input = ws->GetTensor(input_name); + std::string output_name = TransformedName(input_name); + Tensor *output = + ws->CreateTensor(output_name, context->device()->allocator(), dt); + + // update the information + op_def->set_input(input_idx, output_name); + input->MarkUnused(); + return OpenCLBufferTransformer(input->memory_type(), mem_type). + Transform(&op_context, input, buffer_type, 0, mem_type, output); +} + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ diff --git a/mace/ops/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl index 931d0eca803f3a1fb500cf378768958bcf52d39e..6f352d4f9429a605fc49d6809709c3f6cae34948 100644 --- a/mace/ops/opencl/cl/eltwise.cl +++ b/mace/ops/opencl/cl/eltwise.cl @@ -45,7 +45,7 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS DATA_TYPE4 out; #if ELTWISE_TYPE == 0 #ifdef COEFF_SUM - out = mad(coeff1, in0, mad(coeff0, in1, 0)); + out = mad(coeff0, in0, mad(coeff1, in1, 0)); #else out = in0 + in1; #endif diff --git a/mace/ops/opencl/common.h b/mace/ops/opencl/common.h index 0a238960328182147a310e1065922e07b64c9a2f..f0bf872eb84c4b4dd1705ec0b594b10d987b03a7 100644 --- a/mace/ops/opencl/common.h +++ b/mace/ops/opencl/common.h @@ -29,7 +29,6 @@ enum BufferType { WEIGHT_HEIGHT = 7, WEIGHT_WIDTH = 8, }; - } // namespace ops } // namespace mace #endif // MACE_OPS_OPENCL_COMMON_H_ diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 6431972176c2a1b90b3e6362f0c108b8c3ef3c04..e84691f85d1149f5cc87cbc6659b80ae786f2c71 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ #define MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ -#include "mace/ops/opencl/buffer_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include #include diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 9aa65f0e31273a14ab71299b9ce3faa618cf262b..4200087eeeb6052e2e36a1f63e9ce373dd773cd6 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -20,7 +20,7 @@ #include #include "mace/core/op_context.h" -#include "mace/ops/opencl/buffer_inverse_transform.h" +#include "mace/ops/opencl/buffer_transform_kernel.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -29,7 +29,7 @@ namespace opencl { namespace image { template -class ImageToBuffer : public OpenCLBufferInverseTransformKernel { +class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, diff --git a/mace/ops/transformer.cc b/mace/ops/transformer.cc new file mode 100644 index 0000000000000000000000000000000000000000..7df66ffaf96f79d84d2ef454f16728e959386373 --- /dev/null +++ b/mace/ops/transformer.cc @@ -0,0 +1,56 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/transformer.h" + +#include +#include + +namespace mace { +namespace ops { + +std::unique_ptr Transformer::DoTransform( + mace::OperatorDef *op_def, + const int input_idx, + const mace::DataType dt, + const BufferType buffer_type, + const MemoryType mem_type) { + int32_t device = op_def->device_type(); + std::string input_name = op_def->input(input_idx); + std::string output_name = input_name + "_transformed"; + + op_def->set_input(input_idx, output_name); + std::unique_ptr op(new OperatorDef); + op->set_name(output_name); + op->set_type("BufferTransform"); + op->add_input(input_name); + op->add_output(output_name); + Argument *arg = op->add_arg(); + arg->set_name("buffer_type"); + arg->set_i(static_cast(buffer_type)); + arg = op->add_arg(); + arg->set_name("mem_type"); + arg->set_i(static_cast(mem_type)); + arg = op->add_arg(); + arg->set_name("T"); + arg->set_i(static_cast(dt)); + arg = op->add_arg(); + arg->set_name("device"); + arg->set_i(device); + + return std::move(op); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer_inverse_transform.h b/mace/ops/transformer.h similarity index 55% rename from mace/ops/opencl/buffer_inverse_transform.h rename to mace/ops/transformer.h index 7f52a64f3cf3cce9735ea13bd613e8dc287ae94d..67ecd60f768f9b4326f8f331fa600134d0d8776b 100644 --- a/mace/ops/opencl/buffer_inverse_transform.h +++ b/mace/ops/transformer.h @@ -12,30 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#ifndef MACE_KERNELS_TRANSFORMER_H_ +#define MACE_KERNELS_TRANSFORMER_H_ +#include "mace/core/transformer.h" #include "mace/ops/opencl/common.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" namespace mace { - class OpContext; -class Tensor; - namespace ops { -class OpenCLBufferInverseTransformKernel { + +class Transformer : public TransformerBase { public: - virtual MaceStatus Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel) + // Transform source tensor to target. + std::vector> ConstructTranformOp( + OperatorDef *op_def, + bool transform_filter = true) override; + private: + std::unique_ptr DoTransform( + mace::OperatorDef *op_def, + const int input_idx, + const mace::DataType dt, + const BufferType buffer_type, + const MemoryType mem_type); }; + } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#endif // MACE_KERNELS_TENSOR_TRANSFORMER_H_